Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Start replacing TRUELENGTH markers with a hash #6694

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
89 changes: 9 additions & 80 deletions src/assign.c
Original file line number Diff line number Diff line change
Expand Up @@ -827,29 +827,18 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
const int nTargetLevels=length(targetLevels), nSourceLevels=length(sourceLevels);
const SEXP *targetLevelsD=STRING_PTR_RO(targetLevels), *sourceLevelsD=STRING_PTR_RO(sourceLevels);
SEXP newSource = PROTECT(allocVector(INTSXP, length(source))); protecti++;
savetl_init();
hashtab * marks = hash_create((size_t)nTargetLevels + nSourceLevels);
for (int k=0; k<nTargetLevels; ++k) {
const SEXP s = targetLevelsD[k];
const int tl = TRUELENGTH(s);
if (tl>0) {
savetl(s);
} else if (tl<0) {
// # nocov start
for (int j=0; j<k; ++j) SET_TRUELENGTH(s, 0); // wipe our negative usage and restore 0
savetl_end(); // then restore R's own usage (if any)
internal_error(__func__, "levels of target are either not unique or have truelength<0"); // # nocov
// # nocov end
}
SET_TRUELENGTH(s, -k-1);
hash_set(marks, s, -k-1);
}
int nAdd = 0;
for (int k=0; k<nSourceLevels; ++k) {
const SEXP s = sourceLevelsD[k];
const int tl = TRUELENGTH(s);
const int tl = hash_lookup(marks, s, 0);
if (tl>=0) {
if (!sourceIsFactor && s==NA_STRING) continue; // don't create NA factor level when assigning character to factor; test 2117
if (tl>0) savetl(s);
SET_TRUELENGTH(s, -nTargetLevels-(++nAdd));
hash_set(marks, s, -nTargetLevels-(++nAdd));
} // else, when sourceIsString, it's normal for there to be duplicates here
}
const int nSource = length(source);
Expand All @@ -858,45 +847,41 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
const int *sourceD = INTEGER(source);
for (int i=0; i<nSource; ++i) { // convert source integers to refer to target levels
const int val = sourceD[i];
newSourceD[i] = val==NA_INTEGER ? NA_INTEGER : -TRUELENGTH(sourceLevelsD[val-1]); // retains NA factor levels here via TL(NA_STRING); e.g. ordered factor
newSourceD[i] = val==NA_INTEGER ? NA_INTEGER : -hash_lookup(marks, sourceLevelsD[val-1], 0); // retains NA factor levels here via TL(NA_STRING); e.g. ordered factor
}
} else {
const SEXP *sourceD = STRING_PTR_RO(source);
for (int i=0; i<nSource; ++i) { // convert source integers to refer to target levels
const SEXP val = sourceD[i];
newSourceD[i] = val==NA_STRING ? NA_INTEGER : -TRUELENGTH(val);
newSourceD[i] = val==NA_STRING ? NA_INTEGER : -hash_lookup(marks, val, 0);
}
}
source = newSource;
for (int k=0; k<nTargetLevels; ++k) SET_TRUELENGTH(targetLevelsD[k], 0); // don't need those anymore
for (int k=0; k<nTargetLevels; ++k) hash_set(marks, targetLevelsD[k], 0); // don't need those anymore
if (nAdd) {
// cannot grow the levels yet as that would be R call which could fail to alloc and we have no hook to clear up
SEXP *temp = (SEXP *)malloc(nAdd * sizeof(SEXP *));
if (!temp) {
// # nocov start
for (int k=0; k<nSourceLevels; ++k) SET_TRUELENGTH(sourceLevelsD[k], 0);
savetl_end();
error(_("Unable to allocate working memory of %zu bytes to combine factor levels"), nAdd*sizeof(SEXP *));
// # nocov end
}
for (int k=0, thisAdd=0; thisAdd<nAdd; ++k) { // thisAdd<nAdd to stop early when the added ones are all reached
SEXP s = sourceLevelsD[k];
int tl = TRUELENGTH(s);
int tl = hash_lookup(marks, s, 0);
if (tl) { // tl negative here
if (tl != -nTargetLevels-thisAdd-1) internal_error(__func__, "extra level check sum failed"); // # nocov
temp[thisAdd++] = s;
SET_TRUELENGTH(s,0);
hash_set(marks, s, 0);
}
}
savetl_end();
setAttrib(target, R_LevelsSymbol, targetLevels=growVector(targetLevels, nTargetLevels + nAdd));
for (int k=0; k<nAdd; ++k) {
SET_STRING_ELT(targetLevels, nTargetLevels+k, temp[k]);
}
free(temp);
} else {
// all source levels were already in target levels, but not with the same integers; we're done
savetl_end();
}
// now continue, but with the mapped integers in the (new) source
}
Expand Down Expand Up @@ -1271,62 +1256,6 @@ SEXP allocNAVectorLike(SEXP x, R_len_t n) {
return(v);
}

static SEXP *saveds=NULL;
static R_len_t *savedtl=NULL, nalloc=0, nsaved=0;

void savetl_init(void) {
if (nsaved || nalloc || saveds || savedtl) {
internal_error(__func__, _("savetl_init checks failed (%d %d %p %p)"), nsaved, nalloc, (void *)saveds, (void *)savedtl); // # nocov
}
nsaved = 0;
nalloc = 100;
saveds = (SEXP *)malloc(nalloc * sizeof(SEXP));
savedtl = (R_len_t *)malloc(nalloc * sizeof(R_len_t));
if (!saveds || !savedtl) {
free(saveds); free(savedtl); // # nocov
savetl_end(); // # nocov
error(_("Failed to allocate initial %d items in savetl_init"), nalloc); // # nocov
}
}

void savetl(SEXP s)
{
if (nsaved==nalloc) {
if (nalloc==INT_MAX) {
savetl_end(); // # nocov
internal_error(__func__, "reached maximum %d items for savetl", nalloc); // # nocov
}
nalloc = nalloc>(INT_MAX/2) ? INT_MAX : nalloc*2;
char *tmp = (char *)realloc(saveds, nalloc*sizeof(SEXP));
if (tmp==NULL) {
// C spec states that if realloc() fails the original block is left untouched; it is not freed or moved. We rely on that here.
savetl_end(); // # nocov free(saveds) happens inside savetl_end
error(_("Failed to realloc saveds to %d items in savetl"), nalloc); // # nocov
}
saveds = (SEXP *)tmp;
tmp = (char *)realloc(savedtl, nalloc*sizeof(R_len_t));
if (tmp==NULL) {
savetl_end(); // # nocov
error(_("Failed to realloc savedtl to %d items in savetl"), nalloc); // # nocov
}
savedtl = (R_len_t *)tmp;
}
saveds[nsaved] = s;
savedtl[nsaved] = TRUELENGTH(s);
nsaved++;
}

void savetl_end(void) {
// Can get called if nothing has been saved yet (nsaved==0), or even if _init() hasn't been called yet (pointers NULL). Such
// as to clear up before error. Also, it might be that nothing needed to be saved anyway.
for (int i=0; i<nsaved; i++) SET_TRUELENGTH(saveds[i],savedtl[i]);
free(saveds); // possible free(NULL) which is safe no-op
saveds = NULL;
free(savedtl);
savedtl = NULL;
nsaved = nalloc = 0;
}

SEXP setcharvec(SEXP x, SEXP which, SEXP newx)
{
int w;
Expand Down
45 changes: 10 additions & 35 deletions src/chmatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
}
// Since non-ASCII strings may be marked with different encodings, it only make sense to compare
// the bytes under a same encoding (UTF-8) #3844 #3850.
// Not 'const' because we might SET_TRUELENGTH() below.
SEXP *xd;
const SEXP *xd;
if (isSymbol(x)) {
xd = &sym;
} else {
Expand All @@ -56,33 +55,14 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
return ans;
}
// else xlen>1; nprotect is const above since no more R allocations should occur after this point
savetl_init();
for (int i=0; i<xlen; i++) {
SEXP s = xd[i];
const int tl = TRUELENGTH(s);
if (tl>0) {
savetl(s); // R's internal hash (which is positive); save it
SET_TRUELENGTH(s,0);
} else if (tl<0) {
// R 2.14.0+ initializes truelength to 0 (before that it was uninitialized/random).
// Now that data.table depends on R 3.1.0+, that is after 2.14.0 too.
// We rely on that 0-initialization, and that R's internal hash is positive.
// # nocov start
savetl_end();
internal_error(__func__, "CHARSXP '%s' has a negative truelength (%d)", CHAR(s), tl); // # nocov
// # nocov end
}
}
hashtab * marks = hash_create(tablelen);
int nuniq=0;
for (int i=0; i<tablelen; ++i) {
const SEXP s = td[i];
int tl = TRUELENGTH(s);
if (tl>0) { savetl(s); tl=0; }
if (tl==0) SET_TRUELENGTH(s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
int tl = hash_lookup(marks, s, 0);
if (tl==0) hash_set(marks, s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
}
// in future if we need NAs in x not to be matched to NAs in table ...
// if (!matchNAtoNA && TRUELENGTH(NA_STRING)<0)
// SET_TRUELENGTH(NA_STRING, 0);
if (chmatchdup) {
// chmatchdup() is basically base::pmatch() but without the partial matching part. For example :
// chmatchdup(c("a", "a"), c("a", "a")) # 1,2 - the second 'a' in 'x' has a 2nd match in 'table'
Expand All @@ -101,21 +81,19 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
if (!counts || !map) {
// # nocov start
free(counts); free(map);
for (int i=0; i<tablelen; i++) SET_TRUELENGTH(td[i], 0);
savetl_end();
error(_("Failed to allocate %"PRIu64" bytes working memory in chmatchdup: length(table)=%d length(unique(table))=%d"), ((uint64_t)tablelen*2+nuniq)*sizeof(int), tablelen, nuniq);
// # nocov end
}
for (int i=0; i<tablelen; ++i) counts[-TRUELENGTH(td[i])-1]++;
for (int i=0; i<tablelen; ++i) counts[-hash_lookup(marks, td[i], 0)-1]++;
for (int i=0, sum=0; i<nuniq; ++i) { int tt=counts[i]; counts[i]=sum; sum+=tt+1; }
for (int i=0; i<tablelen; ++i) map[counts[-TRUELENGTH(td[i])-1]++] = i+1; // 0 is left ending each group thanks to the calloc
for (int i=0; i<tablelen; ++i) map[counts[-hash_lookup(marks, td[i], 0)-1]++] = i+1; // 0 is left ending each group thanks to the calloc
for (int i=0, last=0; i<nuniq; ++i) {int tt=counts[i]+1; counts[i]=last; last=tt;} // rewind counts to the beginning of each group
for (int i=0; i<xlen; ++i) {
int u = TRUELENGTH(xd[i]);
int u = hash_lookup(marks, xd[i], 0);
if (u<0) {
const int w = counts[-u-1]++;
if (map[w]) { ansd[i]=map[w]; continue; }
SET_TRUELENGTH(xd[i],0); // w falls on ending 0 marker: dups used up; any more dups should return nomatch
hash_set(marks,xd[i],0); // w falls on ending 0 marker: dups used up; any more dups should return nomatch
// we still need the 0-setting loop at the end of this function because often there will be some values in table that are not matched to at all.
}
ansd[i] = nomatch;
Expand All @@ -124,17 +102,14 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
free(map);
} else if (chin) {
for (int i=0; i<xlen; i++) {
ansd[i] = TRUELENGTH(xd[i])<0;
ansd[i] = hash_lookup(marks,xd[i],0)<0;
}
} else {
for (int i=0; i<xlen; i++) {
const int m = TRUELENGTH(xd[i]);
const int m = hash_lookup(marks,xd[i],0);
ansd[i] = (m<0) ? -m : nomatch;
}
}
for (int i=0; i<tablelen; i++)
SET_TRUELENGTH(td[i], 0); // reinstate 0 rather than leave the -i-1
savetl_end();
UNPROTECT(nprotect); // ans, xd, td
return ans;
}
Expand Down
26 changes: 25 additions & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ SEXP growVector(SEXP x, R_len_t newlen);
SEXP allocNAVector(SEXPTYPE type, R_len_t n);
SEXP allocNAVectorLike(SEXP x, R_len_t n);
void writeNA(SEXP v, const int from, const int n, const bool listNA);
void savetl_init(void), savetl(SEXP s), savetl_end(void);
int checkOverAlloc(SEXP x);

// forder.c
Expand Down Expand Up @@ -280,6 +279,31 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env);
//negate.c
SEXP notchin(SEXP x, SEXP table);

// hash.c
typedef struct hash_tab hashtab;
// Allocate, initialise, and return a pointer to the new hash table.
// n is the maximal number of elements that will be inserted.
// Lower load factors lead to fewer collisions and faster lookups, but waste memory.
// May raise an R error if an allocation fails or a size is out of bounds.
// The table is temporary (allocated via R_alloc()) and will be unprotected upon return from the .Call().
// See vmaxget()/vmaxset() if you need to unprotect it manually.
hashtab * hash_create(size_t n);
// Inserts a new key-value pair into the hash, or overwrites an existing value.
// Will raise an R error if inserting more than n elements.
// Don't try to insert a null pointer, nothing good will come out of it.
void hash_set(hashtab *, SEXP key, R_xlen_t value);
// Returns the value corresponding to the key present in the hash, otherwise returns ifnotfound.
R_xlen_t hash_lookup(const hashtab *, SEXP key, R_xlen_t ifnotfound);

// The dynamically-allocated hash table has a public field for the R protection wrapper.
// Keep it PROTECTed while the table is in use.
typedef struct dhash_tab {
SEXP prot;
} dhashtab;
dhashtab * dhash_create(size_t n);
void dhash_set(dhashtab * h, SEXP key, R_xlen_t value);
R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound);

// functions called from R level .Call/.External and registered in init.c
// these now live here to pass -Wstrict-prototypes, #5477
// all arguments must be SEXP since they are called from R level
Expand Down
Loading
Loading