From 5782a7a687441fb398e4ac50ef5bc69fca2526e4 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 26 Dec 2024 11:39:28 +0300 Subject: [PATCH 01/12] Implement the hash table --- src/data.table.h | 19 +++++++++++ src/hash.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 src/hash.c diff --git a/src/data.table.h b/src/data.table.h index ae76a227f..11657e199 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -280,6 +280,25 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); //negate.c SEXP notchin(SEXP x, SEXP table); +// hash.c +typedef struct hash_tab hashtab; +// Allocate, initialise, and return a pointer to the new hash table. +// n is the maximal number of elements that will be inserted. +// load_factor is a real in (0, 1) specifying the desired fraction of used table elements. +// Lower load factors lead to fewer collisions and faster lookups, but waste memory. +// May raise an R error if an allocation fails or a size is out of bounds. +// The table is temporary (allocated via R_alloc()) and will be unprotected upon return from the .Call(). +// See vmaxget()/vmaxset() if you need to unprotect it manually. +hashtab * hash_create_(size_t n, double load_factor); +// Hard-coded "good enough" load_factor +hashtab * hash_create(size_t n); +// Inserts a new key-value pair into the hash, or overwrites an existing value. +// Will raise an R error if inserting more than n elements. +// Don't try to insert a null pointer, nothing good will come out of it. +void hash_set(hashtab *, SEXP key, R_xlen_t value); +// Returns the value corresponding to the key present in the hash, otherwise returns ifnotfound. +R_xlen_t hash_lookup(const hashtab *, SEXP key, R_xlen_t ifnotfound); + // functions called from R level .Call/.External and registered in init.c // these now live here to pass -Wstrict-prototypes, #5477 // all arguments must be SEXP since they are called from R level diff --git a/src/hash.c b/src/hash.c new file mode 100644 index 000000000..fb3e0cc05 --- /dev/null +++ b/src/hash.c @@ -0,0 +1,85 @@ +#include "data.table.h" + +struct hash_pair { + SEXP key; + R_xlen_t value; +}; +struct hash_tab { + size_t size, free; + uintptr_t multiplier; + struct hash_pair tb[]; +}; + +hashtab * hash_create(size_t n) { return hash_create_(n, .5); } +// TAOCP vol. 3, section 6.4: for multiplication hashing, use A ~ 1/phi, the golden ratio. 
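+// (A worked illustration with made-up numbers, not part of the patch: in a table of
+// n_full = 16 slots, the multiplier below truncates to (uintptr_t)(16 * 0.618...) = 9,
+// so successive keys 1, 2, 3, 4, 5 land on slots 9, 2, 11, 4, 13: consecutive inputs
+// scatter across the table instead of clustering, which is what A ~ 1/phi buys us.)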
+static const double hash_multiplier = 0.618033988749895; + +hashtab * hash_create_(size_t n, double load_factor) { + if (load_factor <= 0 || load_factor >= 1) + internal_error("hash_create", "load_factor=%g not in (0, 1)", load_factor); + // precondition: n / load_factor < SIZE_MAX + // truncate to compare in exact integer arithmetic and preserve all bits of n + if ((size_t)(SIZE_MAX * load_factor) <= n) internal_error( + "hash_create", "n=%zu / load_factor=%g would overflow size_t", + n, load_factor + ); + size_t n_full = ceil(n / load_factor); + // precondition: sizeof hashtab + hash_pair[n_full] < SIZE_MAX + // n_full * sizeof hash_pair < SIZE_MAX - sizeof hashtab + // sizeof hash_pair < (SIZE_MAX - sizeof hashtab) / n_full + if (sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) internal_error( + "hash_create", "n=%zu with load_factor=%g would overflow total allocation size", + n, load_factor + ); + hashtab * ret = (hashtab *)R_alloc(sizeof(hashtab) + sizeof(struct hash_pair[n_full]), 1); + ret->size = n_full; + ret->free = n; + // To compute floor(size * (A * key % 1)) in integer arithmetic with A < 1, use ((size * A) * key) % size. + ret->multiplier = n_full * hash_multiplier; + // No valid SEXP is a null pointer, so it's a safe marker for empty cells. + for (size_t i = 0; i < n_full; ++i) + ret->tb[i] = (struct hash_pair){.key = NULL, .value = 0}; + return ret; +} + +// Hashing for an open addressing hash table. See Cormen et al., Introduction to Algorithms, 3rd ed., section 11.4. +// This is far from perfect. Make size a prime or a power of two and you'll be able to use double hashing. +static R_INLINE size_t hash_index(SEXP key, uintptr_t multiplier, size_t offset, size_t size) { + // The 4 lowest bits of the pointer are probably zeroes because a typical SEXPREC exceeds 16 bytes in size. + // Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees, which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes. 
+ return ((((uintptr_t)key) >> 4) * multiplier + offset) % size; +} + +void hash_set(hashtab * h, SEXP key, R_xlen_t value) { + for (size_t i = 0; i < h->size; ++i) { + struct hash_pair * cell = h->tb + hash_index(key, h->multiplier, i, h->size); + if (cell->key == key) { + cell->value = value; + return; + } else if (!cell->key) { + if (!h->free) internal_error( + "hash_insert", "no free slots left (size=%zu after the load factor)", h->size + ); + --h->free; + *cell = (struct hash_pair){.key = key, .value = value}; + return; + } + } + internal_error( + "hash_insert", "did not find a free slot for key %p despite size=%zu, free=%zu", + (void*)key, h->size, h->free + ); +} + +R_xlen_t hash_lookup(const hashtab * h, SEXP key, R_xlen_t ifnotfound) { + for (size_t i = 0; i < h->size; ++i) { + const struct hash_pair * cell = h->tb + hash_index(key, h->multiplier, i, h->size); + if (cell->key == key) { + return cell->value; + } else if (!cell->key) { + return ifnotfound; + } + } + // Should be impossible with a load factor below 1, but just in case: + return ifnotfound; +} From 3846cc931d64fe30a452d7efa5df690687e1bb4a Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 26 Dec 2024 15:32:47 +0300 Subject: [PATCH 02/12] memrecycle(): replace TRUELENGTH marks with a hash --- src/assign.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/src/assign.c b/src/assign.c index b280c2259..7f8e34430 100644 --- a/src/assign.c +++ b/src/assign.c @@ -827,29 +827,18 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con const int nTargetLevels=length(targetLevels), nSourceLevels=length(sourceLevels); const SEXP *targetLevelsD=STRING_PTR_RO(targetLevels), *sourceLevelsD=STRING_PTR_RO(sourceLevels); SEXP newSource = PROTECT(allocVector(INTSXP, length(source))); protecti++; - savetl_init(); + hashtab * marks = hash_create((size_t)nTargetLevels + nSourceLevels); for (int k=0; k0) { - savetl(s); - } else if (tl<0) { - // # nocov start - for (int j=0; j=0) { if (!sourceIsFactor && s==NA_STRING) continue; // don't create NA factor level when assigning character to factor; test 2117 - if (tl>0) savetl(s); - SET_TRUELENGTH(s, -nTargetLevels-(++nAdd)); + hash_set(marks, s, -nTargetLevels-(++nAdd)); } // else, when sourceIsString, it's normal for there to be duplicates here } const int nSource = length(source); @@ -858,37 +847,34 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con const int *sourceD = INTEGER(source); for (int i=0; i Date: Thu, 26 Dec 2024 16:19:06 +0300 Subject: [PATCH 03/12] rbindlist(): replace 1/2 TRUELENGTH with hashing Also avoid crashing when creating a 0-size hash. 
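For the record, the zero-size case now degrades gracefully rather than crashing; a minimal sketch, assuming a hypothetical caller inside a .Call():

    SEXP s = mkChar("x");        // any interned CHARSXP will do
    hashtab *h = hash_create(0); // n_full == 0: just the header, no slots
    hash_lookup(h, s, -1);       // the probe loop runs zero times and returns -1 (ifnotfound)
    // hash_set(h, s, 1) would reach the internal_error() path, since no free slots exist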
--- src/hash.c | 3 ++- src/rbindlist.c | 20 ++++++++------------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/hash.c b/src/hash.c index fb3e0cc05..648e628f8 100644 --- a/src/hash.c +++ b/src/hash.c @@ -27,7 +27,8 @@ hashtab * hash_create_(size_t n, double load_factor) { // precondition: sizeof hashtab + hash_pair[n_full] < SIZE_MAX // n_full * sizeof hash_pair < SIZE_MAX - sizeof hashtab // sizeof hash_pair < (SIZE_MAX - sizeof hashtab) / n_full - if (sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) internal_error( + // (note that sometimes n is 0) + if (n_full && sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) internal_error( "hash_create", "n=%zu with load_factor=%g would overflow total allocation size", n, load_factor ); diff --git a/src/rbindlist.c b/src/rbindlist.c index 42ba9ad74..b33272c1a 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -74,7 +74,10 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor SEXP *uniq = (SEXP *)malloc(upperBoundUniqueNames * sizeof(SEXP)); // upperBoundUniqueNames was initialized with 1 to ensure this is defined (otherwise 0 when no item has names) if (!uniq) error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames); // # nocov - savetl_init(); + R_xlen_t lh = 0; + for (R_xlen_t i=0; i0) savetl(s); + if (hash_lookup(marks, s, 0)<0) continue; // seen this name before uniq[nuniq++] = s; - SET_TRUELENGTH(s,-nuniq); + hash_set(marks, s,-nuniq); } } if (nuniq>0) uniq = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare @@ -99,9 +101,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector if (!counts || !maxdup) { // # nocov start - for (int i=0; i maxdup[u]) maxdup[u] = counts[u]; @@ -134,9 +134,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor int *dupLink = (int *)malloc(ncol * sizeof(int)); // if a colname has occurred before (a dup) links from the 1st to the 2nd time in the final result, 2nd to 3rd, etc if (!colMapRaw || !uniqMap || !dupLink) { // # nocov start - for (int i=0; i Date: Thu, 26 Dec 2024 17:32:53 +0300 Subject: [PATCH 04/12] rbindlist(): replace 2/2 TRUELENGTH with hashing This will likely require a dynamically growing hash of TRUELENGTHs instead of the current pre-allocation approach with a very conservative over-estimate.
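To put rough numbers on that over-estimate (hypothetical input, for illustration only): for l = list(DT, DT) where DT has a single factor column with 1e6 levels, the sizing loop below requests hl = 1e6 + 1e6 = 2e6 marks even though only 1e6 distinct CHARSXPs exist; at the default load factor of 0.5 that is 4e6 slots of 16 bytes each (a SEXP plus an R_xlen_t on 64-bit), about 64 MB for a table that can never be more than a quarter full.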
--- src/hash.c | 4 ++-- src/rbindlist.c | 46 +++++++++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/hash.c b/src/hash.c index 648e628f8..18b4e5e6a 100644 --- a/src/hash.c +++ b/src/hash.c @@ -59,7 +59,7 @@ void hash_set(hashtab * h, SEXP key, R_xlen_t value) { return; } else if (!cell->key) { if (!h->free) internal_error( - "hash_insert", "no free slots left (size=%zu after the load factor)", h->size + "hash_insert", "no free slots left (full size=%zu)", h->size ); --h->free; *cell = (struct hash_pair){.key = key, .value = value}; @@ -67,7 +67,7 @@ void hash_set(hashtab * h, SEXP key, R_xlen_t value) { } } internal_error( - "hash_insert", "did not find a free slot for key %p despite size=%zu, free=%zu", + "hash_insert", "did not find a free slot for key %p; size=%zu, free=%zu", (void*)key, h->size, h->free ); } diff --git a/src/rbindlist.c b/src/rbindlist.c index b33272c1a..7ac3158f2 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -345,7 +345,6 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor if (factor && anyNotStringOrFactor) { // in future warn, or use list column instead ... warning(_("Column %d contains a factor but not all items for the column are character or factor"), idcol+j+1); // some coercing from (likely) integer/numeric to character will be needed. But this coerce can feasibly fail with out-of-memory, so we have to do it up-front - // before the savetl_init() because we have no hook to clean up tl if coerceVector fails. if (coercedForFactor==NULL) { coercedForFactor=PROTECT(allocVector(VECSXP, LENGTH(l))); nprotect++; } for (int i=0; i 0 ? longestLen : 0; + for (R_xlen_t i = 0; i < xlength(l); ++i) { + SEXP li = VECTOR_ELT(l, i); + for (R_xlen_t w = 0; w < xlength(li); ++w) { + SEXP thisCol = VECTOR_ELT(li, w); + SEXP thisColStr = isFactor(thisCol) ? getAttrib(thisCol, R_LevelsSymbol) : thisCol; + hl += xlength(thisColStr); + } + } + hashtab * marks = hash_create(hl); int nLevel=0, allocLevel=0; SEXP *levelsRaw = NULL; // growing list of SEXP pointers. Raw since managed with raw realloc. 
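// A minimal sketch of the doubling-realloc pattern that manages levelsRaw below (the
// initial capacity and the error message are illustrative, not the patch's own):
//   if (nLevel == allocLevel) {
//     int newAlloc = allocLevel ? allocLevel*2 : 256;
//     SEXP *tt = (SEXP *)realloc(levelsRaw, sizeof(SEXP) * newAlloc);
//     if (!tt) { free(levelsRaw); error(_("Failed to grow levelsRaw")); } // old block is still valid on failure
//     levelsRaw = tt; allocLevel = newAlloc;
//   }
// Keeping SEXPs in raw malloc()ed memory is safe here only because every stored CHARSXP
// remains reachable from l, so the garbage collector cannot reclaim it in the meantime.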
if (orderedFactor) { @@ -375,14 +385,12 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor nLevel = allocLevel = longestLen; levelsRaw = (SEXP *)malloc(nLevel * sizeof(SEXP)); if (!levelsRaw) { - savetl_end(); // # nocov error(_("Failed to allocate working memory for %d ordered factor levels of result column %d"), nLevel, idcol+j+1); // # nocov } for (int k=0; k0) savetl(s); levelsRaw[k] = s; - SET_TRUELENGTH(s,-k-1); + hash_set(marks, s, -k-1); } for (int i=0; i=last) { // if tl>=0 then also tl>=last because last<=0 if (tl>=0) { snprintf(warnStr, 1000, // not direct warning as we're inside tl region @@ -434,8 +442,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor for (int k=0; k0) savetl(s); + hash_lookup(marks, s, 0)<0) continue; // seen this level before; handles removing dups from levels as well as finding unique of character columns if (allocLevel==nLevel) { // including initial time when allocLevel==nLevel==0 SEXP *tt = NULL; if (allocLevel Date: Thu, 26 Dec 2024 17:45:14 +0300 Subject: [PATCH 05/12] chmatchMain(): replace TRUELENGTH marks with hash --- src/chmatch.c | 45 ++++++++++----------------------------------- 1 file changed, 10 insertions(+), 35 deletions(-) diff --git a/src/chmatch.c b/src/chmatch.c index 46e68c1d8..ac3851b1f 100644 --- a/src/chmatch.c +++ b/src/chmatch.c @@ -36,8 +36,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch } // Since non-ASCII strings may be marked with different encodings, it only make sense to compare // the bytes under a same encoding (UTF-8) #3844 #3850. - // Not 'const' because we might SET_TRUELENGTH() below. - SEXP *xd; + const SEXP *xd; if (isSymbol(x)) { xd = &sym; } else { @@ -56,33 +55,14 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch return ans; } // else xlen>1; nprotect is const above since no more R allocations should occur after this point - savetl_init(); - for (int i=0; i0) { - savetl(s); // R's internal hash (which is positive); save it - SET_TRUELENGTH(s,0); - } else if (tl<0) { - // R 2.14.0+ initializes truelength to 0 (before that it was uninitialized/random). - // Now that data.table depends on R 3.1.0+, that is after 2.14.0 too. - // We rely on that 0-initialization, and that R's internal hash is positive. - // # nocov start - savetl_end(); - internal_error(__func__, "CHARSXP '%s' has a negative truelength (%d)", CHAR(s), tl); // # nocov - // # nocov end - } - } + hashtab * marks = hash_create(tablelen); int nuniq=0; for (int i=0; i0) { savetl(s); tl=0; } - if (tl==0) SET_TRUELENGTH(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table + int tl = hash_lookup(marks, s, 0); + if (tl==0) hash_set(marks, s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table } // in future if we need NAs in x not to be matched to NAs in table ... - // if (!matchNAtoNA && TRUELENGTH(NA_STRING)<0) - // SET_TRUELENGTH(NA_STRING, 0); if (chmatchdup) { // chmatchdup() is basically base::pmatch() but without the partial matching part. 
For example : // chmatchdup(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table' @@ -101,21 +81,19 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch if (!counts || !map) { // # nocov start free(counts); free(map); - for (int i=0; i Date: Thu, 26 Dec 2024 18:01:34 +0300 Subject: [PATCH 06/12] copySharedColumns(): hash instead of TRUELENGTH --- src/utils.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/utils.c b/src/utils.c index 7f529e943..0c9d23fe6 100644 --- a/src/utils.c +++ b/src/utils.c @@ -261,36 +261,22 @@ void copySharedColumns(SEXP x) { const int ncol = length(x); if (!isNewList(x) || ncol==1) return; bool *shared = (bool *)R_alloc(ncol, sizeof(bool)); // on R heap in case alloc fails - int *savetl = (int *)R_alloc(ncol, sizeof(int)); // on R heap for convenience but could be a calloc + hashtab * marks = hash_create(ncol); const SEXP *xp = SEXPPTR_RO(x); - // first save the truelength, which may be negative on specials in dogroups, and set to zero; test 2157 - // the savetl() function elsewhere is for CHARSXP. Here, we are using truelength on atomic vectors. - for (int i=0; i Date: Thu, 26 Dec 2024 18:08:01 +0300 Subject: [PATCH 07/12] combineFactorLevels(): hash instead of TRUELENGTH --- src/fmelt.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index 59e82455b..fa076ebc8 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -383,9 +383,8 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna } static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType, Rboolean * isRowOrdered) -// Finds unique levels directly in one pass with no need to create hash tables. Creates integer factor -// too in the same single pass. Previous version called factor(x, levels=unique) where x was type character -// and needed hash table. +// Finds unique levels directly in one pass. Creates integer factor too in the same single pass. Previous +// version called factor(x, levels=unique) where x was type character. // TODO keep the original factor columns as factor and use new technique in rbindlist.c. The calling // environments are a little difference hence postponed for now (e.g. rbindlist calls writeNA which // a general purpose combiner would need to know how many to write) @@ -404,8 +403,10 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType SEXP *levelsRaw = (SEXP *)R_alloc(maxlevels, sizeof(SEXP)); // allocate for worst-case all-unique levels int *ansd = INTEGER(ans); const SEXP *targetd = STRING_PTR_RO(target); - savetl_init(); - // no alloc or any fail point until savetl_end() + R_xlen_t hl = 0; + for (R_xlen_t i = 0; i < nitem; ++i) + hl += xlength(VECTOR_ELT(factorLevels, i)); + hashtab * marks = hash_create(hl); int nlevel=0; for (int i=0; i0) savetl(s); - SET_TRUELENGTH(s,-(++nlevel)); + hash_set(marks,s,-(++nlevel)); levelsRaw[nlevel-1] = s; } } @@ -425,13 +425,11 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType if (targetd[i]==NA_STRING) { *ansd++ = NA_INTEGER; } else { - int tl = TRUELENGTH(targetd[i]); + int tl = hash_lookup(marks,targetd[i],0); *ansd++ = tl<0 ? 
-tl : NA_INTEGER; } } - for (int i=0; i Date: Thu, 26 Dec 2024 18:37:11 +0300 Subject: [PATCH 08/12] anySpecialStatic(): hash instead of TRUELENGTH --- src/dogroups.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index 869dfd2d6..c2f84e9f7 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -3,7 +3,7 @@ #include #include -static bool anySpecialStatic(SEXP x) { +static bool anySpecialStatic(SEXP x, hashtab * specials) { // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups // creates .SD for the largest group once up front, overwriting the contents for each group. Their @@ -46,16 +46,16 @@ static bool anySpecialStatic(SEXP x) { if (n==0) return false; if (isVectorAtomic(x)) - return ALTREP(x) || TRUELENGTH(x)<0; + return ALTREP(x) || hash_lookup(specials, x, 0)<0; if (isNewList(x)) { - if (TRUELENGTH(x)<0) + if (hash_lookup(specials, x, 0)<0) return true; // test 2158 for (int i=0; i 1; // showProgress only if more than 1 group double startTime = (showProgress) ? wallclock() : 0; // For progress printing, startTime is set at the beginning double nextTime = (showProgress) ? startTime+3 : 0; // wait 3 seconds before printing progress + hashtab * specials = hash_create(3 + ngrpcols + xlength(SDall)); // .I, .N, .GRP plus columns of .BY plus SDall + defineVar(sym_BY, BY = PROTECT(allocVector(VECSXP, ngrpcols)), env); nprotect++; // PROTECT for rchk SEXP bynames = PROTECT(allocVector(STRSXP, ngrpcols)); nprotect++; // TO DO: do we really need bynames, can we assign names afterwards in one step? for (int i=0; i maxGrpSize) maxGrpSize = ilens[i]; } defineVar(install(".I"), I = PROTECT(allocVector(INTSXP, maxGrpSize)), env); nprotect++; - SET_TRUELENGTH(I, -maxGrpSize); // marker for anySpecialStatic(); see its comments + hash_set(specials, I, -maxGrpSize); // marker for anySpecialStatic(); see its comments R_LockBinding(install(".I"), env); SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by" @@ -150,7 +152,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX nameSyms[i] = install(CHAR(STRING_ELT(names, i))); // fixes http://stackoverflow.com/questions/14753411/why-does-data-table-lose-class-definition-in-sd-after-group-by copyMostAttrib(VECTOR_ELT(dt,INTEGER(dtcols)[i]-1), this); // not names, otherwise test 778 would fail - SET_TRUELENGTH(this, -maxGrpSize); // marker for anySpecialStatic(); see its comments + hash_set(specials, this, -maxGrpSize); // marker for anySpecialStatic(); see its comments } SEXP xknames = PROTECT(getAttrib(xSD, R_NamesSymbol)); nprotect++; @@ -329,7 +331,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX copyMostAttrib(RHS, target); // attributes of first group dominate; e.g. initial factor levels come from first group } bool copied = false; - if (isNewList(target) && anySpecialStatic(RHS)) { // see comments in anySpecialStatic() + if (isNewList(target) && anySpecialStatic(RHS, specials)) { // see comments in anySpecialStatic() RHS = PROTECT(copyAsPlain(RHS)); copied = true; } @@ -435,7 +437,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX error(_("Supplied %d items for column %d of group %d which has %d rows. 
The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn); } bool copied = false; - if (isNewList(target) && anySpecialStatic(source)) { // see comments in anySpecialStatic() + if (isNewList(target) && anySpecialStatic(source, specials)) { // see comments in anySpecialStatic() source = PROTECT(copyAsPlain(source)); copied = true; } @@ -485,12 +487,6 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } SETLENGTH(I, maxGrpSize); SET_TRUELENGTH(I, maxGrpSize); - for (int i=0; i0; From 962b27256592d5dbce745c559de0efaabcf1e091 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sat, 28 Dec 2024 13:42:08 +0300 Subject: [PATCH 09/12] forder(): hash instead of TRUELENGTH The hash needs O(n) memory (actually 2*n/load_factor entries) which isn't great. --- src/forder.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/src/forder.c b/src/forder.c index 8a735e0e8..0b5a72e4e 100644 --- a/src/forder.c +++ b/src/forder.c @@ -64,15 +64,12 @@ static char msg[1001]; #undef warning #define warning(...) Do not use warning in this file // since it can be turned to error via warn=2 /* Using OS realloc() in this file to benefit from (often) in-place realloc() to save copy - * We have to trap on exit anyway to call savetl_end(). * NB: R_alloc() would be more convenient (fails within) and robust (auto free) but there is no R_realloc(). Implementing R_realloc() would be an alloc and copy, iiuc. * R_Calloc/R_Realloc needs to be R_Free'd, even before error() [R-exts$6.1.2]. An oom within R_Calloc causes a previous R_Calloc to leak so R_Calloc would still needs to be trapped anyway. * Therefore, using <> approach to cleanup() on error. */ static void free_ustr(void) { - for(int i=0; i0 (R's usage) are stored there. } void internal_error_with_cleanup(const char *call_name, const char *format, ...) { @@ -291,14 +287,13 @@ static void cradix(SEXP *x, int n) free(cradix_xtmp); cradix_xtmp=NULL; } -static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8) +static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8, hashtab * marks) // group numbers are left in truelength to be fetched by WRITE_KEY { int na_count=0; bool anynotascii=false, anynotutf8=false; if (ustr_n!=0) internal_error_with_cleanup(__func__, "ustr isn't empty when starting range_str: ustr_n=%d, ustr_alloc=%d", ustr_n, ustr_alloc); // # nocov if (ustr_maxlen!=0) internal_error_with_cleanup(__func__, "ustr_maxlen isn't 0 when starting range_str"); // # nocov - // savetl_init() has already been called at the start of forder #pragma omp parallel for num_threads(getDTthreads(n, true)) for(int i=0; i=0) { // another thread may have set it while I was waiting, so check it again - if (TRUELENGTH(s)>0) // save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore - savetl(s); // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much. 
- // now save unique SEXP in ustr so i) we can loop through them afterwards and reset TRUELENGTH to 0 and ii) sort uniques when sorting too + if (hash_lookup(marks,s,0)>=0) { // another thread may have set it while I was waiting, so check it again + // now save unique SEXP in ustr so we can loop sort uniques when sorting too if (ustr_alloc<=ustr_n) { ustr_alloc = (ustr_alloc==0) ? 16384 : ustr_alloc*2; // small initial guess, negligible time to alloc 128KB (32 pages) if (ustr_alloc>n) ustr_alloc = n; // clamp at n. Reaches n when fully unique (no dups) @@ -320,7 +317,7 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max if (ustr==NULL) STOP(_("Unable to realloc %d * %d bytes in range_str"), ustr_alloc, (int)sizeof(SEXP)); // # nocov } ustr[ustr_n++] = s; - SET_TRUELENGTH(s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group + hash_set(marks, s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s); if (!anynotutf8 && // even if anynotascii we still want to know if anynotutf8, and anynotutf8 implies anynotascii already !IS_ASCII(s)) { // anynotutf8 implies anynotascii and IS_ASCII will be cheaper than IS_UTF8, so start with this one @@ -351,23 +348,22 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max for (int i=0; iustr_maxlen) ustr_maxlen=LENGTH(s); - if (TRUELENGTH(s)>0) savetl(s); } cradix(ustr3, ustr_n); // sort to detect possible duplicates after converting; e.g. two different non-utf8 map to the same utf8 - SET_TRUELENGTH(ustr3[0], -1); + hash_set(marks, ustr3[0], -1); int o = -1; for (int i=1; i Date: Sat, 28 Dec 2024 13:47:26 +0300 Subject: [PATCH 10/12] Remove savetl() --- src/assign.c | 56 ------------------------------------------------ src/data.table.h | 1 - 2 files changed, 57 deletions(-) diff --git a/src/assign.c b/src/assign.c index 7f8e34430..a7b083cb8 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1256,62 +1256,6 @@ SEXP allocNAVectorLike(SEXP x, R_len_t n) { return(v); } -static SEXP *saveds=NULL; -static R_len_t *savedtl=NULL, nalloc=0, nsaved=0; - -void savetl_init(void) { - if (nsaved || nalloc || saveds || savedtl) { - internal_error(__func__, _("savetl_init checks failed (%d %d %p %p)"), nsaved, nalloc, (void *)saveds, (void *)savedtl); // # nocov - } - nsaved = 0; - nalloc = 100; - saveds = (SEXP *)malloc(nalloc * sizeof(SEXP)); - savedtl = (R_len_t *)malloc(nalloc * sizeof(R_len_t)); - if (!saveds || !savedtl) { - free(saveds); free(savedtl); // # nocov - savetl_end(); // # nocov - error(_("Failed to allocate initial %d items in savetl_init"), nalloc); // # nocov - } -} - -void savetl(SEXP s) -{ - if (nsaved==nalloc) { - if (nalloc==INT_MAX) { - savetl_end(); // # nocov - internal_error(__func__, "reached maximum %d items for savetl", nalloc); // # nocov - } - nalloc = nalloc>(INT_MAX/2) ? INT_MAX : nalloc*2; - char *tmp = (char *)realloc(saveds, nalloc*sizeof(SEXP)); - if (tmp==NULL) { - // C spec states that if realloc() fails the original block is left untouched; it is not freed or moved. We rely on that here. 
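// A minimal sketch of the idiom that comment describes (buf and newsize are illustrative):
//   char *tmp = (char *)realloc(buf, newsize);
//   if (tmp == NULL) { // realloc() failed: buf is untouched and still owned by us,
//     free(buf);       // so it can, and must, still be cleaned up
//     error(_("realloc failed"));
//   }
//   buf = tmp;         // only overwrite buf once realloc() has succeeded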
- savetl_end(); // # nocov free(saveds) happens inside savetl_end - error(_("Failed to realloc saveds to %d items in savetl"), nalloc); // # nocov - } - saveds = (SEXP *)tmp; - tmp = (char *)realloc(savedtl, nalloc*sizeof(R_len_t)); - if (tmp==NULL) { - savetl_end(); // # nocov - error(_("Failed to realloc savedtl to %d items in savetl"), nalloc); // # nocov - } - savedtl = (R_len_t *)tmp; - } - saveds[nsaved] = s; - savedtl[nsaved] = TRUELENGTH(s); - nsaved++; -} - -void savetl_end(void) { - // Can get called if nothing has been saved yet (nsaved==0), or even if _init() hasn't been called yet (pointers NULL). Such - // as to clear up before error. Also, it might be that nothing needed to be saved anyway. - for (int i=0; i Date: Sun, 29 Dec 2024 11:41:07 +0300 Subject: [PATCH 11/12] Add codecov suppressions --- src/hash.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hash.c b/src/hash.c index 18b4e5e6a..9484d175f 100644 --- a/src/hash.c +++ b/src/hash.c @@ -16,7 +16,7 @@ static const double hash_multiplier = 0.618033988749895; hashtab * hash_create_(size_t n, double load_factor) { if (load_factor <= 0 || load_factor >= 1) - internal_error("hash_create", "load_factor=%g not in (0, 1)", load_factor); + internal_error("hash_create", "load_factor=%g not in (0, 1)", load_factor); // # nocov // precondition: n / load_factor < SIZE_MAX // truncate to compare in exact integer arithmetic and preserve all bits of n if ((size_t)(SIZE_MAX * load_factor) <= n) internal_error( "hash_create", "n=%zu / load_factor=%g would overflow size_t", n, load_factor ); @@ -66,7 +66,7 @@ void hash_set(hashtab * h, SEXP key, R_xlen_t value) { return; } } - internal_error( + internal_error( // # nocov "hash_insert", "did not find a free slot for key %p; size=%zu, free=%zu", (void*)key, h->size, h->free ); @@ -82,5 +82,5 @@ R_xlen_t hash_lookup(const hashtab * h, SEXP key, R_xlen_t ifnotfound) { } } // Should be impossible with a load factor below 1, but just in case: - return ifnotfound; + return ifnotfound; // # nocov } From d7a9a1707ec94ec4f2bd86a5dfb5609207029ba4 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Wed, 1 Jan 2025 15:03:38 +0300 Subject: [PATCH 12/12] Dynamically grow the hash table with bound unknown In forder() and rbindlist(), no good upper bound on the number of elements in the hash is known ahead of time. Grow the hash table dynamically. Since the R/W locks are far too slow and OpenMP atomics are too limited, rely on strategically placed flushes, which isn't really a solution. --- src/data.table.h | 12 +++- src/forder.c | 35 +++++----- src/hash.c | 169 ++++++++++++++++++++++++++++++++++++++++++----- src/rbindlist.c | 50 +++++--------- 4 files changed, 198 insertions(+), 68 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 252f5e3b5..f9e502be8 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -283,13 +283,10 @@ SEXP notchin(SEXP x, SEXP table); typedef struct hash_tab hashtab; // Allocate, initialise, and return a pointer to the new hash table. // n is the maximal number of elements that will be inserted. -// load_factor is a real in (0, 1) specifying the desired fraction of used table elements. // Lower load factors lead to fewer collisions and faster lookups, but waste memory. // May raise an R error if an allocation fails or a size is out of bounds. // The table is temporary (allocated via R_alloc()) and will be unprotected upon return from the .Call(). // See vmaxget()/vmaxset() if you need to unprotect it manually.
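// A minimal sketch of that manual release (hypothetical caller; vmaxget()/vmaxset() are
// documented in Writing R Extensions):
//   const void *vmax = vmaxget(); // remember the current R_alloc() stack position
//   hashtab *h = hash_create(n);
//   ...use h; avoid R_alloc()ing anything here that must outlive the table...
//   vmaxset(vmax);                // releases h and everything R_alloc()ed since vmaxget()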
-hashtab * hash_create_(size_t n, double load_factor); -// Hard-coded "good enough" load_factor hashtab * hash_create(size_t n); // Inserts a new key-value pair into the hash, or overwrites an existing value. // Will raise an R error if inserting more than n elements. @@ -298,6 +295,15 @@ void hash_set(hashtab *, SEXP key, R_xlen_t value); // Returns the value corresponding to the key present in the hash, otherwise returns ifnotfound. R_xlen_t hash_lookup(const hashtab *, SEXP key, R_xlen_t ifnotfound); +// The dynamically-allocated hash table has a public field for the R protection wrapper. +// Keep it PROTECTed while the table is in use. +typedef struct dhash_tab { + SEXP prot; +} dhashtab; +dhashtab * dhash_create(size_t n); +void dhash_set(dhashtab * h, SEXP key, R_xlen_t value); +R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound); + // functions called from R level .Call/.External and registered in init.c // these now live here to pass -Wstrict-prototypes, #5477 // all arguments must be SEXP since they are called from R level diff --git a/src/forder.c b/src/forder.c index 0b5a72e4e..c62f562b0 100644 --- a/src/forder.c +++ b/src/forder.c @@ -287,7 +287,7 @@ static void cradix(SEXP *x, int n) free(cradix_xtmp); cradix_xtmp=NULL; } -static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8, hashtab * marks) +static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8, dhashtab * marks) // group numbers are left in truelength to be fetched by WRITE_KEY { int na_count=0; @@ -302,13 +302,13 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max na_count++; continue; } - // Why is it acceptable to call hash_lookup when marks can be shared between threads? - // 1. There are no pointers to follow for hash_lookup() inside the hash table, so there's no danger of crashing by following a partially written pointer. - // 2. If another thread writes s into the hash but hash_lookup() fails to see a non-zero value, we'll safely check it again in the critical section below. + // Why is it acceptable to call dhash_lookup when marks can be shared between threads? + // 1. dhash never frees the table out from under a reader: growth keeps the previous allocation alive and publishes the new one with flushes, so a stale pointer read by another thread stays valid. + // 2. We check again after entering the critical section. // 3. We only change the marks from zero to nonzero, so once a nonzero value is seen, it must be correct. - if (hash_lookup(marks,s,0)<0) continue; // seen this group before - #pragma omp critical - if (hash_lookup(marks,s,0)>=0) { // another thread may have set it while I was waiting, so check it again + if (dhash_lookup(marks,s,0)<0) continue; // seen this group before + #pragma omp critical(range_str_write) + if (dhash_lookup(marks,s,0)>=0) { // another thread may have set it while I was waiting, so check it again // now save unique SEXP in ustr so we can loop sort uniques when sorting too if (ustr_alloc<=ustr_n) { ustr_alloc = (ustr_alloc==0) ? 16384 : ustr_alloc*2; // small initial guess, negligible time to alloc 128KB (32 pages) @@ -320,7 +317,7 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max if (ustr==NULL) STOP(_("Unable to realloc %d * %d bytes in range_str"), ustr_alloc, (int)sizeof(SEXP)); // # nocov } ustr[ustr_n++] = s; - hash_set(marks, s, -ustr_n); // unique in any order is fine.
first-appearance order is achieved later in count_group + dhash_set(marks, s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s); if (!anynotutf8 && // even if anynotascii we still want to know if anynotutf8, and anynotutf8 implies anynotascii already !IS_ASCII(s)) { // anynotutf8 implies anynotascii and IS_ASCII will be cheaper than IS_UTF8, so start with this one @@ -350,20 +350,20 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s); } cradix(ustr3, ustr_n); // sort to detect possible duplicates after converting; e.g. two different non-utf8 map to the same utf8 - hash_set(marks, ustr3[0], -1); + dhash_set(marks, ustr3[0], -1); int o = -1; for (int i=1; iprot); n_protect++; range_str(STRING_PTR_RO(x), nrow, &min, &max, &na_count, &anynotascii, &anynotutf8, marks); break; default: @@ -763,7 +764,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA if (nalast==-1) anso[i]=0; elem = naval; } else { - elem = -hash_lookup(marks, xd[i], 0); + elem = -dhash_lookup(marks, xd[i], 0); } WRITE_KEY }} diff --git a/src/hash.c b/src/hash.c index 9484d175f..a02c531ea 100644 --- a/src/hash.c +++ b/src/hash.c @@ -1,3 +1,5 @@ +#include + #include "data.table.h" struct hash_pair { @@ -10,28 +12,33 @@ struct hash_tab { struct hash_pair tb[]; }; -hashtab * hash_create(size_t n) { return hash_create_(n, .5); } // TAOCP vol. 3, section 6.4: for multiplication hashing, use A ~ 1/phi, the golden ratio. static const double hash_multiplier = 0.618033988749895; -hashtab * hash_create_(size_t n, double load_factor) { +static R_INLINE size_t get_full_size(size_t n_elements, double load_factor) { if (load_factor <= 0 || load_factor >= 1) - internal_error("hash_create", "load_factor=%g not in (0, 1)", load_factor); // # nocov + internal_error(__func__, "load_factor=%g not in (0, 1)", load_factor); // # nocov // precondition: n / load_factor < SIZE_MAX - // truncate to compare in exact integer arithmetic and preserve all bits of n - if ((size_t)(SIZE_MAX * load_factor) <= n) internal_error( - "hash_create", "n=%zu / load_factor=%g would overflow size_t", - n, load_factor + // this is implemented a bit stricter than needed and would fail some almost-too-high sizes + // due to the size_t -> double conversion + if ((size_t)((double)SIZE_MAX * load_factor) <= n_elements) internal_error( + __func__, "n=%zu / load_factor=%g would overflow size_t", + n_elements, load_factor ); - size_t n_full = ceil(n / load_factor); + return ceil(n_elements / load_factor); +} + +static hashtab * hash_create_(size_t n, double load_factor) { + size_t n_full = get_full_size(n, load_factor); // precondition: sizeof hashtab + hash_pair[n_full] < SIZE_MAX // n_full * sizeof hash_pair < SIZE_MAX - sizeof hashtab // sizeof hash_pair < (SIZE_MAX - sizeof hashtab) / n_full // (note that sometimes n is 0) - if (n_full && sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) internal_error( - "hash_create", "n=%zu with load_factor=%g would overflow total allocation size", - n, load_factor - ); + if (n_full && sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) + internal_error( + __func__, "n=%zu with load_factor=%g would overflow total allocation size", + n, load_factor + ); hashtab * ret = (hashtab *)R_alloc(sizeof(hashtab) + sizeof(struct hash_pair[n_full]), 1); ret->size = n_full; ret->free = n; @@ -39,15 +46,18 @@ 
hashtab * hash_create_(size_t n, double load_factor) { ret->multiplier = n_full * hash_multiplier; // No valid SEXP is a null pointer, so it's a safe marker for empty cells. for (size_t i = 0; i < n_full; ++i) - ret->tb[i] = (struct hash_pair){.key = NULL, .value = 0}; + ret->tb[i].key = NULL; return ret; } +hashtab * hash_create(size_t n) { return hash_create_(n, .5); } + // Hashing for an open addressing hash table. See Cormen et al., Introduction to Algorithms, 3rd ed., section 11.4. // This is far from perfect. Make size a prime or a power of two and you'll be able to use double hashing. static R_INLINE size_t hash_index(SEXP key, uintptr_t multiplier, size_t offset, size_t size) { // The 4 lowest bits of the pointer are probably zeroes because a typical SEXPREC exceeds 16 bytes in size. - // Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees, which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes. + // Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees, + // which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes. return ((((uintptr_t)key) >> 4) * multiplier + offset) % size; } @@ -59,7 +69,7 @@ void hash_set(hashtab * h, SEXP key, R_xlen_t value) { return; } else if (!cell->key) { if (!h->free) internal_error( - "hash_insert", "no free slots left (full size=%zu)", h->size + __func__, "no free slots left (full size=%zu)", h->size ); --h->free; *cell = (struct hash_pair){.key = key, .value = value}; @@ -67,7 +77,7 @@ void hash_set(hashtab * h, SEXP key, R_xlen_t value) { } } internal_error( // # nocov - "hash_insert", "did not find a free slot for key %p; size=%zu, free=%zu", + __func__, "did not find a free slot for key %p; size=%zu, free=%zu", (void*)key, h->size, h->free ); } @@ -84,3 +94,130 @@ R_xlen_t hash_lookup(const hashtab * h, SEXP key, R_xlen_t ifnotfound) { // Should be impossible with a load factor below 1, but just in case: return ifnotfound; // # nocov } + +typedef struct dhashtab_ { + dhashtab public; // must be at offset 0 + size_t size, used, limit; + uintptr_t multiplier; + struct hash_pair *table, *previous; +} dhashtab_; + +static void dhash_finalizer(SEXP dhash) { + dhashtab_ * self = R_ExternalPtrAddr(dhash); + if (!self) return; + R_ClearExternalPtr(dhash); + free(self->previous); + free(self->table); + free(self); +} + +static struct hash_pair * dhash_allocate(size_t n_full) { + if (n_full > SIZE_MAX / sizeof(struct hash_pair)) + internal_error(__func__, "%zu hash table slots would overflow size_t", n_full); // # nocov + struct hash_pair * new = malloc(sizeof(struct hash_pair[n_full])); + if (!new) internal_error(__func__, "failed to malloc() %zu hash table slots", n_full); // # nocov + for (size_t i = 0; i < n_full; ++i) new[i] = (struct hash_pair){.key = NULL}; + return new; +} + +static dhashtab * dhash_create_(size_t n, double load_factor) { + size_t n_full = get_full_size(n, load_factor); + + SEXP prot = PROTECT(R_MakeExternalPtr(NULL, R_NilValue, R_NilValue)); + R_RegisterCFinalizerEx(prot, dhash_finalizer, TRUE); + dhashtab_ * self = malloc(sizeof(dhashtab_)); + if (!self) internal_error(__func__, "failed to malloc() the hash table header"); // # nocov + *self = (dhashtab_){ + .public = { .prot = prot }, + }; + R_SetExternalPtrAddr(prot, self); + + self->table = dhash_allocate(n_full); + self->size = n_full; + self->limit = n; + self->multiplier = n_full * hash_multiplier; + // this is the last time we're allowed to set the table parts piece by 
piece + + UNPROTECT(1); + return &self->public; +} + +dhashtab * dhash_create(size_t n) { return dhash_create_(n, .5); } + +static void dhash_enlarge(dhashtab_ * self) { + if (self->size > SIZE_MAX / 2) + internal_error(__func__, "doubling %zu elements would overflow size_t", self->size); // # nocov + size_t new_size = self->size * 2; + struct hash_pair * new = dhash_allocate(new_size); + uintptr_t new_multiplier = new_size * hash_multiplier; + for (size_t i = 0; i < self->size; ++i) { + for (size_t j = 0; j < new_size; ++j) { + size_t ii = hash_index(self->table[i].key, new_multiplier, j, new_size); + if (!new[ii].key) { + new[ii] = (struct hash_pair){ + .key = self->table[i].key, + .value = self->table[i].value + }; + break; + } + } + } + // Not trying to protect from calls to _set -> _enlarge from other threads! + // Writes only come from a critical section, so two threads will not attempt to enlarge at the same time. + // What we have to prevent is yanking the self->table from under a different thread reading it right now. + free(self->previous); + struct hash_pair * previous = self->table; + dhashtab public = self->public; + size_t used = self->used, limit = self->limit*2; + *self = (dhashtab_){ + .public = public, + .size = new_size, + .used = used, + .limit = limit, + .multiplier = new_multiplier, + .table = new, + .previous = previous, + }; + #pragma omp flush // no locking or atomic access! this is bad +} + +void dhash_set(dhashtab * h, SEXP key, R_xlen_t value) { + dhashtab_ * self = (dhashtab_ *)h; +again: + for (size_t i = 0; i < self->size; ++i) { + struct hash_pair * cell = self->table + hash_index(key, self->multiplier, i, self->size); + if (cell->key == key) { + cell->value = value; + return; + } else if (!cell->key) { + if (self->used < self->limit) { + *cell = (struct hash_pair){ .key = key, .value = value }; + ++self->used; + return; + } + dhash_enlarge(self); + goto again; // won't be needed next time with the limit doubled + } + } + internal_error( // # nocov + __func__, "did not find a free slot for key %p; size=%zu, used=%zu, limit=%zu", + (void*)key, self->size, self->used, self->limit + ); +} + +R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound) { + #pragma omp flush // no locking or atomic access! 
this is bad + dhashtab_ self = *(dhashtab_ *)h; + R_xlen_t ret = ifnotfound; + for (size_t i = 0; i < self.size; ++i) { + const struct hash_pair * cell = self.table + hash_index(key, self.multiplier, i, self.size); + if (cell->key == key) { + ret = cell->value; + goto done; + } else if (!cell->key) { + goto done; + } + } +done: + return ret; +} diff --git a/src/rbindlist.c b/src/rbindlist.c index 7ac3158f2..99746b2e9 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -74,10 +74,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor SEXP *uniq = (SEXP *)malloc(upperBoundUniqueNames * sizeof(SEXP)); // upperBoundUniqueNames was initialized with 1 to ensure this is defined (otherwise 0 when no item has names) if (!uniq) error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames); // # nocov - R_xlen_t lh = 0; - for (R_xlen_t i=0; i0) uniq = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare @@ -116,7 +113,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor memset(counts, 0, nuniq*sizeof(int)); for (int j=0; j maxdup[u]) maxdup[u] = counts[u]; @@ -155,7 +152,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor memset(counts, 0, nuniq*sizeof(int)); for (int j=0; j 0 ? longestLen : 0; - for (R_xlen_t i = 0; i < xlength(l); ++i) { - SEXP li = VECTOR_ELT(l, i); - for (R_xlen_t w = 0; w < xlength(li); ++w) { - SEXP thisCol = VECTOR_ELT(li, w); - SEXP thisColStr = isFactor(thisCol) ? getAttrib(thisCol, R_LevelsSymbol) : thisCol; - hl += xlength(thisColStr); - } - } - hashtab * marks = hash_create(hl); + dhashtab * marks = dhash_create(1024); int nLevel=0, allocLevel=0; SEXP *levelsRaw = NULL; // growing list of SEXP pointers. Raw since managed with raw realloc. if (orderedFactor) { @@ -390,7 +376,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor for (int k=0; k=last) { // if tl>=0 then also tl>=last because last<=0 if (tl>=0) { snprintf(warnStr, 1000, // not direct warning as we're inside tl region @@ -442,7 +428,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor for (int k=0; k