Skip to content

Commit

Permalink
fwrite integer64 implemented with tests. turbo renamed to ..turbo wit…
Browse files Browse the repository at this point in the history
…h warning that it will be removed.
  • Loading branch information
mattdowle committed Nov 2, 2016
1 parent 4a1f4ba commit 6d55d2f
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 30 deletions.
7 changes: 4 additions & 3 deletions R/fwrite.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
fwrite <- function(x, file="", append=FALSE, quote="auto",
sep=",", eol=if (.Platform$OS.type=="windows") "\r\n" else "\n",
na="", col.names=TRUE, qmethod="double", verbose=FALSE, turbo=TRUE) {
na="", col.names=TRUE, qmethod="double", verbose=FALSE, ..turbo=TRUE) {

isLOGICAL <- function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R?
na = as.character(na[1L]) # fix for #1725
Expand All @@ -12,13 +12,14 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
length(qmethod) == 1L && qmethod %in% c("double", "escape"),
isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(verbose),
length(na) == 1L, #1725, handles NULL or character(0) input
isLOGICAL(turbo),
isLOGICAL(..turbo),
is.character(file) && length(file)==1 && !is.na(file))
file <- path.expand(file) # "~/foo/bar"
if (append && missing(col.names) && (file=="" || file.exists(file)))
col.names = FALSE # test 1658.16 checks this
if (!..turbo) warning("The ..turbo=FALSE option will be removed in future. Please report any problems with ..turbo=TRUE.")
if (verbose || file=="") old=setDTthreads(1) # console output isn't thread safe
.Call(Cwritefile, x, file, sep, eol, na, quote, qmethod == "escape", append, col.names, verbose, turbo)
.Call(Cwritefile, x, file, sep, eol, na, quote, qmethod == "escape", append, col.names, verbose, ..turbo)
if (verbose) setDTthreads(old)
invisible()
}
Expand Down
54 changes: 53 additions & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -9422,6 +9422,57 @@ ans = c("V1","5.123456789e+300","-5.123456789e+300",
# 0 11111110100 011001101011100100100011110110110000 01001110 01011101
test(1729.9, fwrite(DT), output=paste(ans,collapse=""))
test(1729.11, write.csv(DT,row.names=FALSE,quote=FALSE), output=paste(ans,collapse=""))
DT = data.table(unlist(.Machine[c("double.eps","double.neg.eps","double.xmin","double.xmax")]))
# double.eps double.neg.eps double.xmin double.xmax
# 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308
test(1729.12, typeof(DT[[1L]]), "double")
test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)))

if ("package:bit64" %in% search()) {
test(1730.1, typeof(-2147483647L), "integer")
test(1730.2, as.integer(-2147483648), NA_integer_, warning="NAs introduced by coercion to integer range")
test(1730.3, as.integer("-2147483647"), -2147483647L)
test(1730.4, as.integer("-2147483648"), NA_integer_, warning="NAs introduced by coercion to integer range")
test(1730.5, as.integer64("-2147483648"), as.integer64(-2147483648))
test(1730.6, as.character((as.integer64(2^62)-1)*2+1), "9223372036854775807")
test(1730.7, as.character((as.integer64(2^62)-1)*2+2), NA_character_, warning="integer64 overflow")
test(1730.8, as.character(-(as.integer64(2^62)-1)*2-1), "-9223372036854775807")
test(1730.9, as.character(-(as.integer64(2^62)-1)*2-2), NA_character_, warning="integer64.*flow")
# Currently bit64 truncs to extremes in character coercion. Don't test that in case bit64 changes in future.
# as.integer64("-9223372036854775808") == NA
# as.integer64("-9223372036854775999") == NA
# as.integer64("+9223372036854775808") == 9223372036854775807
# as.integer64("+9223372036854775999") == 9223372036854775807
DT = data.table( as.integer64(c(
"-9223372036854775807", # integer64 min 2^63-1
"+9223372036854775807", # integer64 max
"-9223372036854775806","+9223372036854775806", # 1 below extreme just to check
"0","-1","1",
"NA",NA,
"-2147483646", # 1 below extreme to check
"-2147483647", # smallest integer in R
"-2147483648", # NA_INTEGER == INT_MIN but valid integer64
"-2147483649",
"+2147483646", # positives as well just in case
"+2147483647",
"+2147483648",
"+2147483649"
)))
ans = c("V1","-9223372036854775807","9223372036854775807","-9223372036854775806","9223372036854775806",
"0","-1","1","__NA__","__NA__",
"-2147483646","-2147483647","-2147483648","-2147483649",
"2147483646","2147483647","2147483648","2147483649")
test(1731.1, class(DT[[1L]]), "integer64")
test(1731.2, fwrite(DT,na="__NA__"), output=paste(ans,collapse=""))
f = tempfile()
test(1731.3, fwrite(DT,f, na="__NA__",..turbo=FALSE), NULL, warning="turbo.*will be removed")
test(1731.4, readLines(f), ans)
unlink(f)
test(1731.5, write.csv(DT,na="__NA__",row.names=FALSE,quote=FALSE), output=paste(ans,collapse=""))
# write.csv works on integer64 because it calls bit64's as.character method
} else {
cat("Tests 1730 & 1731 not run. If required call library(bit64) first.\n")
}


##########################
Expand Down Expand Up @@ -9457,7 +9508,8 @@ options(warn=0)
setDTthreads(0)
options(oldalloccol) # set at top of this file
options(oldWhenJsymbol)
plat = paste("endian=",.Platform$endian,", sizeof(long double)==",.Machine$sizeof.longdouble,sep="")
plat = paste("endian=",.Platform$endian,", sizeof(long double)==",.Machine$sizeof.longdouble,
", sizeof(pointer)==",.Machine$sizeof.pointer, sep="")
if (nfail > 0) {
if (nfail>1) {s1="s";s2="s: "} else {s1="";s2=" "}
cat("\r")
Expand Down
8 changes: 4 additions & 4 deletions man/fwrite.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ As \code{write.csv} and but \emph{much} faster (e.g. 1 minute versus 2 seconds)
fwrite(x, file = "", append = FALSE, quote = "auto", sep = ",",
eol = if (.Platform$OS.type=="windows") "\r\n" else "\n",
na = "", col.names = TRUE, qmethod = "double",
verbose=FALSE, turbo=TRUE)
verbose=FALSE, ..turbo=TRUE)
}
\arguments{
\item{x}{A \code{data.table} or \code{data.frame} to write.}
Expand All @@ -25,18 +25,18 @@ fwrite(x, file = "", append = FALSE, quote = "auto", sep = ",",
\item{"double" (default), in which case it is doubled.}
}}
\item{verbose}{Be chatty and report timings?}
\item{turbo}{Use specialized custom C code to format numeric and integer columns. This reduces call overhead to the C library and avoids any use of memory buffers (copies) at all. Try with and without to see the difference it makes on your machine and please report any significant differences in output.}
\item{..turbo}{Use specialized custom C code to format numeric, integer and integer64 columns. This reduces call overhead to the C library and avoids any use of memory buffers (copies) at all. Try with and without to see the difference it makes on your machine and please report any significant differences in output. If you do find cases where \code{..turbo=FALSE} is needed, please report them as bugs, since this option WILL BE REMOVED in future. Hence why it has the \code{__} prefix.}

This comment has been minimized.

Copy link
@MichaelChirico

MichaelChirico Nov 2, 2016

Member

says __ prefix but prefix is ..

}
\details{
\code{fwrite} began as a community contribution with a \href{https://github.com/Rdatatable/data.table/pull/1613}{Pull Request PR#1613} by Otto Seiskari. Following that, Matt worked on reducing time spent on I/O with buffered write. This resulted in further speed enhancements.

Since those improvements resulted in the time spent almost entirely on formatting, it meant that we could benefit a lot from parallelisation. This was also done, which improved performance even further (YMMV depending on the number of cores / threads per core in one's machine).
Finally, with \code{turbo = TRUE}, the time spent on formatting (through calls to C libraries) itself was reduced by implementing native C-code for writing \code{integer} and \code{numeric} types.
Finally, with \code{..turbo = TRUE}, the time spent on formatting (through calls to C libraries) itself was reduced by implementing native C-code for writing \code{integer} and \code{numeric} types.
The logic for formatting \code{integer} columns is simpler (= faster) than \code{numeric} types. Therefore columns stored as \code{numeric} types, but in reality are \code{integers} are identified and formatted using integer logic for further improvement.
With this, writing a \code{data.table} of approximately 23 million rows and 19 columns (~2.85GB on disk) takes ~5.9s with \code{turbo = TRUE} and ~20s with \code{turbo = FALSE} on a 13' Macbook Pro with 512GB SSD and an i7 processor with 2 cores containing one thread per core (and two virtual threads via hyperthreading).
With this, writing a \code{data.table} of approximately 23 million rows and 19 columns (~2.85GB on disk) takes ~5.9s with \code{..turbo = TRUE} and ~20s with \code{..turbo = FALSE} on a 13' Macbook Pro with 512GB SSD and an i7 processor with 2 cores containing one thread per core (and two virtual threads via hyperthreading).

}
\seealso{
Expand Down
76 changes: 54 additions & 22 deletions src/fwrite.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,16 @@ static inline int maxStrLen(SEXP x, int na_len) {
#define DECIMAL_SEP '.' // TODO allow other decimal separator e.g. ','

// Globals for this file only (written once to hold parameters passed from R level)
static int na_len;
static size_t na_len;
static const char *na_str;

static inline void writeInteger(int x, char **thisCh)
static inline void writeInteger(long long x, char **thisCh)
{
char *ch = *thisCh;
if (x == NA_INTEGER) {
if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
} else if (x == 0) {
// both integer and integer64 are passed to this function so careful
// to test for NA_INTEGER in the calling code. INT_MIN (NA_INTEGER) is
// a valid non-NA in integer64
if (x == 0) {
*ch++ = '0';
} else {
if (x<0) { *ch++ = '-'; x=-x; }
Expand Down Expand Up @@ -126,7 +127,7 @@ static inline void writeNumeric(double x, char **thisCh)
char *ch = *thisCh;
if (!R_FINITE(x)) {
if (ISNAN(x)) {
if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; } // by default na_len==0 and the memcpy call will be skipped
memcpy(ch, na_str, na_len); ch += na_len; // by default na_len==0 and the memcpy call will be skipped
} else if (x>0) {
*ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f';
} else {
Expand Down Expand Up @@ -236,6 +237,16 @@ static inline void writeNumeric(double x, char **thisCh)
*thisCh = ch;
}

inline Rboolean isInteger64(SEXP x) {
SEXP class = getAttrib(x, R_ClassSymbol);
if (isString(class)) {
for (int i=0; i<LENGTH(class); i++) { // inherits()
if (STRING_ELT(class, i) == char_integer64) return TRUE;
}
}
return FALSE;
}

SEXP writefile(SEXP list_of_columns,
SEXP filenameArg,
SEXP col_sep_Arg,
Expand Down Expand Up @@ -302,15 +313,17 @@ SEXP writefile(SEXP list_of_columns,
SEXP levels[ncols]; // on-stack vla
int lineLenMax = 2; // initialize with eol max width of \r\n on windows
int sameType = TYPEOF(VECTOR_ELT(list_of_columns, 0));
Rboolean integer64[ncols]; // store result of isInteger64() per column for efficiency
for (int col_i=0; col_i<ncols; col_i++) {
SEXP column = VECTOR_ELT(list_of_columns, col_i);
if (TYPEOF(column) != sameType) sameType = 0;
integer64[col_i] = FALSE;
switch(TYPEOF(column)) {
case LGLSXP:
lineLenMax+=5; // width of FALSE
break;
case REALSXP:
lineLenMax+=25; // +- 15digits dec e +- nnn = 22 + 3 safety = 25
integer64[col_i] = isInteger64(column);
lineLenMax+=25; // +- 15digits dec e +- nnn = 22 + 3 safety = 25. That covers int64 too (20 digits).
break;
case INTSXP:
if (isFactor(column)) {
Expand All @@ -329,6 +342,9 @@ SEXP writefile(SEXP list_of_columns,
default:
error("Column %d's type is '%s' - not yet implemented.", col_i+1,type2char(TYPEOF(column)) );
}
if (TYPEOF(column) != sameType || integer64[col_i]) sameType = 0;
// we could code up all-integer64 case below as well but that seems even less
// likely in practice than all-int or all-double
lineLenMax++; // column separator
}
clock_t tlineLenMax=clock()-t0;
Expand All @@ -351,7 +367,7 @@ SEXP writefile(SEXP list_of_columns,
for (int col_i=0; col_i<ncols; col_i++) {
SEXP str = STRING_ELT(names, col_i);
if (str==NA_STRING) {
if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
memcpy(ch, na_str, na_len); ch += na_len;
break;
}
if (quote) {
Expand Down Expand Up @@ -404,7 +420,7 @@ SEXP writefile(SEXP list_of_columns,
int upp = start_row + rowsPerBatch;
if (upp > nrows) upp = nrows;
if (turbo && sameType == REALSXP) {
// avoid deep switch. turbo switches on both sameType and specialized writeNumeric
// avoid deep switch() on type. turbo switches on both sameType and specialized writeNumeric
for (RLEN row_i = start_row; row_i < upp; row_i++) {
for (int col_i = 0; col_i < ncols; col_i++) {
SEXP column = VECTOR_ELT(list_of_columns, col_i);
Expand All @@ -419,23 +435,28 @@ SEXP writefile(SEXP list_of_columns,
for (RLEN row_i = start_row; row_i < upp; row_i++) {
for (int col_i = 0; col_i < ncols; col_i++) {
SEXP column = VECTOR_ELT(list_of_columns, col_i);
writeInteger(INTEGER(column)[row_i], &ch);
if (INTEGER(column)[row_i] == NA_INTEGER) {
memcpy(ch, na_str, na_len); ch += na_len;
} else {
writeInteger(INTEGER(column)[row_i], &ch);
}
*ch++ = col_sep;
}
ch--;
memcpy(ch, row_sep, row_sep_len);
ch += row_sep_len;
}
} else {
// mixed types. switch() on every cell value since must write row-by-row
for (RLEN row_i = start_row; row_i < upp; row_i++) {
for (int col_i = 0; col_i < ncols; col_i++) {
SEXP column = VECTOR_ELT(list_of_columns, col_i);
SEXP str;
SEXP str; // no declare within switch() allowed by C, otherwise would do
switch(TYPEOF(column)) {
case LGLSXP:
true_false = LOGICAL(column)[row_i];
if (true_false == NA_LOGICAL) {
if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
memcpy(ch, na_str, na_len); ch += na_len;
} else if (true_false) {
memcpy(ch,"TRUE",4); // Other than strings, field widths are limited which we check elsewhere here to ensure
ch += 4;
Expand All @@ -445,22 +466,33 @@ SEXP writefile(SEXP list_of_columns,
}
break;
case REALSXP:
if (ISNA(REAL(column)[row_i])) {
if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
if (integer64[col_i]) {
long long i64 = *(long long *)&REAL(column)[row_i];
if (i64 == NAINT64) {
memcpy(ch, na_str, na_len); ch += na_len;
} else {
if (turbo) {
writeInteger(i64, &ch);
} else {
ch += sprintf(ch, "%lld", i64);
}
}
} else {
if (turbo) {
// if there are any problems with the hand rolled double writing, then turbo=FALSE reverts to standard library
writeNumeric(REAL(column)[row_i], &ch);
writeNumeric(REAL(column)[row_i], &ch); // handles NA, Inf etc within it
} else {
//tt0 = clock();
ch += sprintf(ch, "%.15G", REAL(column)[row_i]);
//tNUM += clock()-tt0;
// if there are any problems with the specialized writeNumeric, user can revert to (slower) standard library
if (ISNAN(REAL(column)[row_i])) {
memcpy(ch, na_str, na_len); ch += na_len;
} else {
ch += sprintf(ch, "%.15g", REAL(column)[row_i]);
}
}
}
break;
case INTSXP:
if (INTEGER(column)[row_i] == NA_INTEGER) {
if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
memcpy(ch, na_str, na_len); ch += na_len;
} else if (levels[col_i] != NULL) { // isFactor(column) == TRUE
str = STRING_ELT(levels[col_i], INTEGER(column)[row_i]-1);
if (quote) {
Expand All @@ -480,7 +512,7 @@ SEXP writefile(SEXP list_of_columns,
case STRSXP:
str = STRING_ELT(column, row_i);
if (str==NA_STRING) {
if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
memcpy(ch, na_str, na_len); ch += na_len;
} else if (quote) {
QUOTE_FIELD;
} else {
Expand Down

0 comments on commit 6d55d2f

Please sign in to comment.