diff --git a/R/fwrite.R b/R/fwrite.R index 38ddf3a72..8d524b88a 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -1,6 +1,6 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", sep=",", eol=if (.Platform$OS.type=="windows") "\r\n" else "\n", - na="", col.names=TRUE, qmethod="double", verbose=FALSE, turbo=TRUE) { + na="", col.names=TRUE, qmethod="double", verbose=FALSE, ..turbo=TRUE) { isLOGICAL <- function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R? na = as.character(na[1L]) # fix for #1725 @@ -12,13 +12,14 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", length(qmethod) == 1L && qmethod %in% c("double", "escape"), isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(verbose), length(na) == 1L, #1725, handles NULL or character(0) input - isLOGICAL(turbo), + isLOGICAL(..turbo), is.character(file) && length(file)==1 && !is.na(file)) file <- path.expand(file) # "~/foo/bar" if (append && missing(col.names) && (file=="" || file.exists(file))) col.names = FALSE # test 1658.16 checks this + if (!..turbo) warning("The ..turbo=FALSE option will be removed in future. 
Please report any problems with ..turbo=TRUE.") if (verbose || file=="") old=setDTthreads(1) # console output isn't thread safe - .Call(Cwritefile, x, file, sep, eol, na, quote, qmethod == "escape", append, col.names, verbose, turbo) + .Call(Cwritefile, x, file, sep, eol, na, quote, qmethod == "escape", append, col.names, verbose, ..turbo) if (verbose) setDTthreads(old) invisible() } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d489f21a2..5759777d4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9422,6 +9422,57 @@ ans = c("V1","5.123456789e+300","-5.123456789e+300", # 0 11111110100 011001101011100100100011110110110000 01001110 01011101 test(1729.9, fwrite(DT), output=paste(ans,collapse="")) test(1729.11, write.csv(DT,row.names=FALSE,quote=FALSE), output=paste(ans,collapse="")) +DT = data.table(unlist(.Machine[c("double.eps","double.neg.eps","double.xmin","double.xmax")])) +# double.eps double.neg.eps double.xmin double.xmax +# 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308 +test(1729.12, typeof(DT[[1L]]), "double") +test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) + +if ("package:bit64" %in% search()) { + test(1730.1, typeof(-2147483647L), "integer") + test(1730.2, as.integer(-2147483648), NA_integer_, warning="NAs introduced by coercion to integer range") + test(1730.3, as.integer("-2147483647"), -2147483647L) + test(1730.4, as.integer("-2147483648"), NA_integer_, warning="NAs introduced by coercion to integer range") + test(1730.5, as.integer64("-2147483648"), as.integer64(-2147483648)) + test(1730.6, as.character((as.integer64(2^62)-1)*2+1), "9223372036854775807") + test(1730.7, as.character((as.integer64(2^62)-1)*2+2), NA_character_, warning="integer64 overflow") + test(1730.8, as.character(-(as.integer64(2^62)-1)*2-1), "-9223372036854775807") + test(1730.9, as.character(-(as.integer64(2^62)-1)*2-2), NA_character_, warning="integer64.*flow") + # Currently bit64 
truncs to extremes in character coercion. Don't test that in case bit64 changes in future. + # as.integer64("-9223372036854775808") == NA + # as.integer64("-9223372036854775999") == NA + # as.integer64("+9223372036854775808") == 9223372036854775807 + # as.integer64("+9223372036854775999") == 9223372036854775807 + DT = data.table( as.integer64(c( + "-9223372036854775807", # integer64 min 2^63-1 + "+9223372036854775807", # integer64 max + "-9223372036854775806","+9223372036854775806", # 1 below extreme just to check + "0","-1","1", + "NA",NA, + "-2147483646", # 1 below extreme to check + "-2147483647", # smallest integer in R + "-2147483648", # NA_INTEGER == INT_MIN but valid integer64 + "-2147483649", + "+2147483646", # positives as well just in case + "+2147483647", + "+2147483648", + "+2147483649" + ))) + ans = c("V1","-9223372036854775807","9223372036854775807","-9223372036854775806","9223372036854775806", + "0","-1","1","__NA__","__NA__", + "-2147483646","-2147483647","-2147483648","-2147483649", + "2147483646","2147483647","2147483648","2147483649") + test(1731.1, class(DT[[1L]]), "integer64") + test(1731.2, fwrite(DT,na="__NA__"), output=paste(ans,collapse="")) + f = tempfile() + test(1731.3, fwrite(DT,f, na="__NA__",..turbo=FALSE), NULL, warning="turbo.*will be removed") + test(1731.4, readLines(f), ans) + unlink(f) + test(1731.5, write.csv(DT,na="__NA__",row.names=FALSE,quote=FALSE), output=paste(ans,collapse="")) + # write.csv works on integer64 because it calls bit64's as.character method +} else { + cat("Tests 1730 & 1731 not run. 
If required call library(bit64) first.\n") +} ########################## @@ -9457,7 +9508,8 @@ options(warn=0) setDTthreads(0) options(oldalloccol) # set at top of this file options(oldWhenJsymbol) -plat = paste("endian=",.Platform$endian,", sizeof(long double)==",.Machine$sizeof.longdouble,sep="") +plat = paste("endian=",.Platform$endian,", sizeof(long double)==",.Machine$sizeof.longdouble, + ", sizeof(pointer)==",.Machine$sizeof.pointer, sep="") if (nfail > 0) { if (nfail>1) {s1="s";s2="s: "} else {s1="";s2=" "} cat("\r") diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 6d44549b7..b19d65596 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -8,7 +8,7 @@ As \code{write.csv} and but \emph{much} faster (e.g. 1 minute versus 2 seconds) fwrite(x, file = "", append = FALSE, quote = "auto", sep = ",", eol = if (.Platform$OS.type=="windows") "\r\n" else "\n", na = "", col.names = TRUE, qmethod = "double", - verbose=FALSE, turbo=TRUE) + verbose=FALSE, ..turbo=TRUE) } \arguments{ \item{x}{A \code{data.table} or \code{data.frame} to write.} @@ -25,18 +25,18 @@ fwrite(x, file = "", append = FALSE, quote = "auto", sep = ",", \item{"double" (default), in which case it is doubled.} }} \item{verbose}{Be chatty and report timings?} - \item{turbo}{Use specialized custom C code to format numeric and integer columns. This reduces call overhead to the C library and avoids any use of memory buffers (copies) at all. Try with and without to see the difference it makes on your machine and please report any significant differences in output.} + \item{..turbo}{Use specialized custom C code to format numeric, integer and integer64 columns. This reduces call overhead to the C library and avoids any use of memory buffers (copies) at all. Try with and without to see the difference it makes on your machine and please report any significant differences in output. If you do find cases where \code{..turbo=FALSE} is needed, please report them as bugs, since this option WILL BE REMOVED in future. 
Hence the \code{..} prefix.} } \details{ \code{fwrite} began as a community contribution with a \href{https://github.com/Rdatatable/data.table/pull/1613}{Pull Request PR#1613} by Otto Seiskari. Following that, Matt worked on reducing time spent on I/O with buffered write. This resulted in further speed enhancements. Since those improvements resulted in the time spent almost entirely on formatting, it meant that we could benefit a lot from parallelisation. This was also done, which improved performance even further (YMMV depending on the number of cores / threads per core in one's machine). -Finally, with \code{turbo = TRUE}, the time spent on formatting (through calls to C libraries) itself was reduced by implementing native C-code for writing \code{integer} and \code{numeric} types. +Finally, with \code{..turbo = TRUE}, the time spent on formatting (through calls to C libraries) itself was reduced by implementing native C-code for writing \code{integer} and \code{numeric} types. The logic for formatting \code{integer} columns is simpler (= faster) than \code{numeric} types. Therefore columns stored as \code{numeric} types, but in reality are \code{integers} are identified and formatted using integer logic for further improvement. -With this, writing a \code{data.table} of approximately 23 million rows and 19 columns (~2.85GB on disk) takes ~5.9s with \code{turbo = TRUE} and ~20s with \code{turbo = FALSE} on a 13' Macbook Pro with 512GB SSD and an i7 processor with 2 cores containing one thread per core (and two virtual threads via hyperthreading). +With this, writing a \code{data.table} of approximately 23 million rows and 19 columns (~2.85GB on disk) takes ~5.9s with \code{..turbo = TRUE} and ~20s with \code{..turbo = FALSE} on a 13-inch MacBook Pro with 512GB SSD and an i7 processor with 2 cores containing one thread per core (and two virtual threads via hyperthreading). 
} \seealso{ diff --git a/src/fwrite.c b/src/fwrite.c index 9755b5451..7a6b2f351 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -43,15 +43,16 @@ static inline int maxStrLen(SEXP x, int na_len) { #define DECIMAL_SEP '.' // TODO allow other decimal separator e.g. ',' // Globals for this file only (written once to hold parameters passed from R level) -static int na_len; +static size_t na_len; static const char *na_str; -static inline void writeInteger(int x, char **thisCh) +static inline void writeInteger(long long x, char **thisCh) { char *ch = *thisCh; - if (x == NA_INTEGER) { - if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; } - } else if (x == 0) { + // both integer and integer64 are passed to this function so careful + // to test for NA_INTEGER in the calling code. INT_MIN (NA_INTEGER) is + // a valid non-NA in integer64 + if (x == 0) { *ch++ = '0'; } else { if (x<0) { *ch++ = '-'; x=-x; } @@ -126,7 +127,7 @@ static inline void writeNumeric(double x, char **thisCh) char *ch = *thisCh; if (!R_FINITE(x)) { if (ISNAN(x)) { - if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; } // by default na_len==0 and the memcpy call will be skipped + memcpy(ch, na_str, na_len); ch += na_len; // by default na_len==0 and the memcpy call will be skipped } else if (x>0) { *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f'; } else { @@ -236,6 +237,16 @@ static inline void writeNumeric(double x, char **thisCh) *thisCh = ch; } +inline Rboolean isInteger64(SEXP x) { + SEXP class = getAttrib(x, R_ClassSymbol); + if (isString(class)) { + for (int i=0; i nrows) upp = nrows; if (turbo && sameType == REALSXP) { - // avoid deep switch. turbo switches on both sameType and specialized writeNumeric + // avoid deep switch() on type. 
turbo switches on both sameType and specialized writeNumeric for (RLEN row_i = start_row; row_i < upp; row_i++) { for (int col_i = 0; col_i < ncols; col_i++) { SEXP column = VECTOR_ELT(list_of_columns, col_i); @@ -419,7 +435,11 @@ SEXP writefile(SEXP list_of_columns, for (RLEN row_i = start_row; row_i < upp; row_i++) { for (int col_i = 0; col_i < ncols; col_i++) { SEXP column = VECTOR_ELT(list_of_columns, col_i); - writeInteger(INTEGER(column)[row_i], &ch); + if (INTEGER(column)[row_i] == NA_INTEGER) { + memcpy(ch, na_str, na_len); ch += na_len; + } else { + writeInteger(INTEGER(column)[row_i], &ch); + } *ch++ = col_sep; } ch--; @@ -427,15 +447,16 @@ SEXP writefile(SEXP list_of_columns, ch += row_sep_len; } } else { + // mixed types. switch() on every cell value since must write row-by-row for (RLEN row_i = start_row; row_i < upp; row_i++) { for (int col_i = 0; col_i < ncols; col_i++) { SEXP column = VECTOR_ELT(list_of_columns, col_i); - SEXP str; + SEXP str; // no declare within switch() allowed by C, otherwise would do switch(TYPEOF(column)) { case LGLSXP: true_false = LOGICAL(column)[row_i]; if (true_false == NA_LOGICAL) { - if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; } + memcpy(ch, na_str, na_len); ch += na_len; } else if (true_false) { memcpy(ch,"TRUE",4); // Other than strings, field widths are limited which we check elsewhere here to ensure ch += 4; @@ -445,22 +466,33 @@ SEXP writefile(SEXP list_of_columns, } break; case REALSXP: - if (ISNA(REAL(column)[row_i])) { - if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; } + if (integer64[col_i]) { + long long i64 = *(long long *)&REAL(column)[row_i]; + if (i64 == NAINT64) { + memcpy(ch, na_str, na_len); ch += na_len; + } else { + if (turbo) { + writeInteger(i64, &ch); + } else { + ch += sprintf(ch, "%lld", i64); + } + } } else { if (turbo) { - // if there are any problems with the hand rolled double writing, then turbo=FALSE reverts to standard library - writeNumeric(REAL(column)[row_i], &ch); + 
writeNumeric(REAL(column)[row_i], &ch); // handles NA, Inf etc within it } else { - //tt0 = clock(); - ch += sprintf(ch, "%.15G", REAL(column)[row_i]); - //tNUM += clock()-tt0; + // if there are any problems with the specialized writeNumeric, user can revert to (slower) standard library + if (ISNAN(REAL(column)[row_i])) { + memcpy(ch, na_str, na_len); ch += na_len; + } else { + ch += sprintf(ch, "%.15g", REAL(column)[row_i]); + } } } break; case INTSXP: if (INTEGER(column)[row_i] == NA_INTEGER) { - if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; } + memcpy(ch, na_str, na_len); ch += na_len; } else if (levels[col_i] != NULL) { // isFactor(column) == TRUE str = STRING_ELT(levels[col_i], INTEGER(column)[row_i]-1); if (quote) { @@ -480,7 +512,7 @@ SEXP writefile(SEXP list_of_columns, case STRSXP: str = STRING_ELT(column, row_i); if (str==NA_STRING) { - if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; } + memcpy(ch, na_str, na_len); ch += na_len; } else if (quote) { QUOTE_FIELD; } else {