Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

memrecycle no snprintf overhead #5463

Merged
merged 7 commits into from
Oct 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,8 @@

14. The options `datatable.print.class` and `datatable.print.keys` are now `TRUE` by default. They have been available since v1.9.8 (Nov 2016) and v1.11.0 (May 2018) respectively.

15. Thanks to @ssh352, Václav Tlapák, Cole Miller, András Svraka and Toby Dylan Hocking for reporting and bisecting a significant performance regression in dev. This was fixed before release thanks to a PR by Jan Gorecki, [#5463](https://github.com/Rdatatable/data.table/pull/5463).


# data.table [v1.14.4](https://github.com/Rdatatable/data.table/milestone/26?closed=1)

Expand Down
130 changes: 67 additions & 63 deletions src/assign.c
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,12 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
#define MSGSIZE 1000
static char memrecycle_message[MSGSIZE+1]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects

const char *targetDesc(const int colnum, const char *colname) {
static char str[501]; // #5463
snprintf(str, 500, colnum==0 ? _("target vector") : _("column %d named '%s'"), colnum, colname);
return str;
}

const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceStart, const int sourceLen, const int colnum, const char *colname)
// like memcpy but recycles single-item source
// 'where' a 1-based INTEGER vector subset of target to assign to, or NULL or integer()
Expand All @@ -707,8 +713,6 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
if (colname==NULL)
error(_("Internal error: memrecycle has received NULL colname")); // # nocov
*memrecycle_message = '\0';
static char targetDesc[501]; // from 1.14.1 coerceAs reuses memrecycle for a target vector, PR#4491
snprintf(targetDesc, 500, colnum==0 ? _("target vector") : _("column %d named '%s'"), colnum, colname);
int protecti=0;
const bool sourceIsFactor=isFactor(source), targetIsFactor=isFactor(target);
const bool sourceIsI64=isReal(source) && INHERITS(source, char_integer64);
Expand All @@ -730,15 +734,15 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
for (int i=0; i<slen; ++i) {
const int val = sd[i+soff];
if ((val<1 && val!=NA_INTEGER) || val>nlevel) {
error(_("Assigning factor numbers to %s. But %d is outside the level range [1,%d]"), targetDesc, val, nlevel);
error(_("Assigning factor numbers to %s. But %d is outside the level range [1,%d]"), targetDesc(colnum, colname), val, nlevel);
}
}
} else {
const double *sd = REAL(source);
for (int i=0; i<slen; ++i) {
const double val = sd[i+soff];
if (!ISNAN(val) && (!R_FINITE(val) || val!=(int)val || (int)val<1 || (int)val>nlevel)) {
error(_("Assigning factor numbers to %s. But %f is outside the level range [1,%d], or is not a whole number."), targetDesc, val, nlevel);
error(_("Assigning factor numbers to %s. But %f is outside the level range [1,%d], or is not a whole number."), targetDesc(colnum, colname), val, nlevel);
}
}
}
Expand Down Expand Up @@ -830,47 +834,47 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
}
}
} else if (isString(source) && !isString(target) && !isNewList(target)) {
warning(_("Coercing 'character' RHS to '%s' to match the type of %s."), targetIsI64?"integer64":type2char(TYPEOF(target)), targetDesc);
warning(_("Coercing 'character' RHS to '%s' to match the type of %s."), targetIsI64?"integer64":type2char(TYPEOF(target)), targetDesc(colnum, colname));
// this "Coercing ..." warning first to give context in case coerceVector warns 'NAs introduced by coercion'
// and also because 'character' to integer/double coercion is often a user mistake (e.g. wrong target column, or wrong
// variable on RHS) which they are more likely to appreciate than find inconvenient
source = PROTECT(coerceVector(source, TYPEOF(target))); protecti++;
} else if (isNewList(source) && !isNewList(target)) {
if (targetIsI64) {
error(_("Cannot coerce 'list' RHS to 'integer64' to match the type of %s."), targetDesc);
error(_("Cannot coerce 'list' RHS to 'integer64' to match the type of %s."), targetDesc(colnum, colname));
// because R's coerceVector doesn't know about integer64
}
// as in base R; e.g. let as.double(list(1,2,3)) work but not as.double(list(1,c(2,4),3))
// relied on by NNS, simstudy and table.express; tests 1294.*
warning(_("Coercing 'list' RHS to '%s' to match the type of %s."), type2char(TYPEOF(target)), targetDesc);
warning(_("Coercing 'list' RHS to '%s' to match the type of %s."), type2char(TYPEOF(target)), targetDesc(colnum, colname));
source = PROTECT(coerceVector(source, TYPEOF(target))); protecti++;
} else if ((TYPEOF(target)!=TYPEOF(source) || targetIsI64!=sourceIsI64) && !isNewList(target)) {
if (GetVerbose()>=3) {
// only take the (small) cost of GetVerbose() (search of options() list) when types don't match
Rprintf(_("Zero-copy coerce when assigning '%s' to '%s' %s.\n"),
sourceIsI64 ? "integer64" : type2char(TYPEOF(source)),
targetIsI64 ? "integer64" : type2char(TYPEOF(target)),
targetDesc);
targetDesc(colnum, colname));
}
// The following checks are up front here, otherwise we'd need them twice in the two branches
// inside BODY that cater for 'where' or not. Maybe there's a way to merge the two macros in future.
// The idea is to do these range checks without calling coerceVector() (which allocates)

#define CHECK_RANGE(STYPE, RFUN, COND, FMT, TO, FMTVAL) {{ \
const STYPE *sd = (const STYPE *)RFUN(source); \
for (int i=0; i<slen; ++i) { \
const STYPE val = sd[i+soff]; \
if (COND) { \
const char *sType = sourceIsI64 ? "integer64" : type2char(TYPEOF(source)); \
const char *tType = targetIsI64 ? "integer64" : type2char(TYPEOF(target)); \
snprintf(memrecycle_message, MSGSIZE, \
"%"FMT" (type '%s') at RHS position %d "TO" when assigning to type '%s' (%s)", \
FMTVAL, sType, i+1, tType, targetDesc); \
/* string returned so that rbindlist/dogroups can prefix it with which item of its list this refers to */ \
break; \
} \
} \
} break; }
#define CHECK_RANGE(STYPE, RFUN, COND, FMT, TO, FMTVAL) {{ \
const STYPE *sd = (const STYPE *)RFUN(source); \
for (int i=0; i<slen; ++i) { \
const STYPE val = sd[i+soff]; \
if (COND) { \
const char *sType = sourceIsI64 ? "integer64" : type2char(TYPEOF(source)); \
const char *tType = targetIsI64 ? "integer64" : type2char(TYPEOF(target)); \
snprintf(memrecycle_message, MSGSIZE, \
"%"FMT" (type '%s') at RHS position %d "TO" when assigning to type '%s' (%s)", \
FMTVAL, sType, i+1, tType, targetDesc(colnum, colname)); \
/* string returned so that rbindlist/dogroups can prefix it with which item of its list this refers to */ \
break; \
} \
} \
} break; }

switch(TYPEOF(target)) {
case LGLSXP:
Expand Down Expand Up @@ -909,47 +913,47 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
}
}

#undef BODY
#define BODY(STYPE, RFUN, CTYPE, CAST, ASSIGN) {{ \
const STYPE *sd = (const STYPE *)RFUN(source); \
if (length(where)) { \
if (slen==1) { \
const STYPE val = sd[soff]; \
const CTYPE cval = CAST; \
for (int wi=0; wi<len; ++wi) { \
const int w = wd[wi]; \
if (w<1) continue; /*0 or NA*/ \
const int i = w-1; \
ASSIGN; \
} \
} else { \
for (int wi=0; wi<len; ++wi) { \
const int w = wd[wi]; \
if (w<1) continue; \
const STYPE val = sd[wi+soff]; \
const CTYPE cval = CAST; \
const int i = w-1; \
ASSIGN; \
} \
} \
} else { \
if (slen==1) { \
const STYPE val = sd[soff]; \
const CTYPE cval = CAST; \
for (int i=0; i<len; ++i) { \
ASSIGN; \
} \
} else { \
for (int i=0; i<len; i++) { \
const STYPE val = sd[i+soff]; \
const CTYPE cval = CAST; \
ASSIGN; \
} \
} \
} \
} break; }
#undef BODY
#define BODY(STYPE, RFUN, CTYPE, CAST, ASSIGN) {{ \
const STYPE *sd = (const STYPE *)RFUN(source); \
if (length(where)) { \
if (slen==1) { \
const STYPE val = sd[soff]; \
const CTYPE cval = CAST; \
for (int wi=0; wi<len; ++wi) { \
const int w = wd[wi]; \
if (w<1) continue; /*0 or NA*/ \
const int i = w-1; \
ASSIGN; \
} \
} else { \
for (int wi=0; wi<len; ++wi) { \
const int w = wd[wi]; \
if (w<1) continue; \
const STYPE val = sd[wi+soff]; \
const CTYPE cval = CAST; \
const int i = w-1; \
ASSIGN; \
} \
} \
} else { \
if (slen==1) { \
const STYPE val = sd[soff]; \
const CTYPE cval = CAST; \
for (int i=0; i<len; ++i) { \
ASSIGN; \
} \
} else { \
for (int i=0; i<len; i++) { \
const STYPE val = sd[i+soff]; \
const CTYPE cval = CAST; \
ASSIGN; \
} \
} \
} \
} break; }

#define COERCE_ERROR(targetType) error(_("type '%s' cannot be coerced to '%s'"), type2char(TYPEOF(source)), targetType); // 'targetType' for integer64 vs double
#define COERCE_ERROR(targetType) error(_("type '%s' cannot be coerced to '%s'"), type2char(TYPEOF(source)), targetType); // 'targetType' for integer64 vs double

const int off = length(where) ? 0 : start; // off = target offset; e.g. called from rbindlist with where=R_NilValue and start!=0
const bool mc = length(where)==0 && slen>0 && slen==len && soff==0; // mc=memcpy; only if types match and not for single items (a single assign faster than these non-const memcpy calls)
Expand Down