fwrite integer64 implemented with tests. turbo renamed to ..turbo wit…

…h warning that it will be removed.
Rdatatable · Nov 2, 2016 · 6d55d2f · MichaelChirico · Nov 2, 2016 · 6d55d2f
1 parent 4a1f4ba
commit 6d55d2f
Show file tree

Hide file tree

Showing 4 changed files with 115 additions and 30 deletions.
diff --git a/R/fwrite.R b/R/fwrite.R
@@ -1,6 +1,6 @@
 fwrite <- function(x, file="", append=FALSE, quote="auto",
  sep=",", eol=if (.Platform$OS.type=="windows") "\r\n" else "\n",
- na="", col.names=TRUE, qmethod="double", verbose=FALSE, turbo=TRUE) {
+ na="", col.names=TRUE, qmethod="double", verbose=FALSE, ..turbo=TRUE) {
 
  isLOGICAL <- function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R?
  na = as.character(na[1L]) # fix for #1725
@@ -12,13 +12,14 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
  length(qmethod) == 1L && qmethod %in% c("double", "escape"), 
  isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(verbose), 
  length(na) == 1L, #1725, handles NULL or character(0) input
- isLOGICAL(turbo),
+ isLOGICAL(..turbo),
  is.character(file) && length(file)==1 && !is.na(file))
  file <- path.expand(file) # "~/foo/bar"
  if (append && missing(col.names) && (file=="" || file.exists(file)))
  col.names = FALSE # test 1658.16 checks this
+ if (!..turbo) warning("The ..turbo=FALSE option will be removed in future. Please report any problems with ..turbo=TRUE.")
  if (verbose || file=="") old=setDTthreads(1) # console output isn't thread safe 
- .Call(Cwritefile, x, file, sep, eol, na, quote, qmethod == "escape", append, col.names, verbose, turbo)
+ .Call(Cwritefile, x, file, sep, eol, na, quote, qmethod == "escape", append, col.names, verbose, ..turbo)
  if (verbose) setDTthreads(old)
  invisible()
 }

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -9422,6 +9422,57 @@ ans = c("V1","5.123456789e+300","-5.123456789e+300",
 # 0 11111110100 011001101011100100100011110110110000 01001110 01011101
 test(1729.9, fwrite(DT), output=paste(ans,collapse=""))
 test(1729.11, write.csv(DT,row.names=FALSE,quote=FALSE), output=paste(ans,collapse=""))
+DT = data.table(unlist(.Machine[c("double.eps","double.neg.eps","double.xmin","double.xmax")]))
+# double.eps double.neg.eps double.xmin double.xmax
+# 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308
+test(1729.12, typeof(DT[[1L]]), "double")
+test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)))
+
+if ("package:bit64" %in% search()) {
+ test(1730.1, typeof(-2147483647L), "integer")
+ test(1730.2, as.integer(-2147483648), NA_integer_, warning="NAs introduced by coercion to integer range")
+ test(1730.3, as.integer("-2147483647"), -2147483647L)
+ test(1730.4, as.integer("-2147483648"), NA_integer_, warning="NAs introduced by coercion to integer range")
+ test(1730.5, as.integer64("-2147483648"), as.integer64(-2147483648))
+ test(1730.6, as.character((as.integer64(2^62)-1)*2+1), "9223372036854775807")
+ test(1730.7, as.character((as.integer64(2^62)-1)*2+2), NA_character_, warning="integer64 overflow")
+ test(1730.8, as.character(-(as.integer64(2^62)-1)*2-1), "-9223372036854775807")
+ test(1730.9, as.character(-(as.integer64(2^62)-1)*2-2), NA_character_, warning="integer64.*flow")
+ # Currently bit64 truncs to extremes in character coercion. Don't test that in case bit64 changes in future.
+ # as.integer64("-9223372036854775808") == NA
+ # as.integer64("-9223372036854775999") == NA
+ # as.integer64("+9223372036854775808") == 9223372036854775807
+ # as.integer64("+9223372036854775999") == 9223372036854775807 
+ DT = data.table( as.integer64(c(
+ "-9223372036854775807", # integer64 min 2^63-1
+ "+9223372036854775807", # integer64 max
+ "-9223372036854775806","+9223372036854775806", # 1 below extreme just to check
+ "0","-1","1",
+ "NA",NA,
+ "-2147483646", # 1 below extreme to check
+ "-2147483647", # smallest integer in R
+ "-2147483648", # NA_INTEGER == INT_MIN but valid integer64
+ "-2147483649",
+ "+2147483646", # positives as well just in case
+ "+2147483647",
+ "+2147483648",
+ "+2147483649"
+ )))
+ ans = c("V1","-9223372036854775807","9223372036854775807","-9223372036854775806","9223372036854775806",
+ "0","-1","1","__NA__","__NA__",
+ "-2147483646","-2147483647","-2147483648","-2147483649",
+ "2147483646","2147483647","2147483648","2147483649")
+ test(1731.1, class(DT[[1L]]), "integer64")
+ test(1731.2, fwrite(DT,na="__NA__"), output=paste(ans,collapse=""))
+ f = tempfile()
+ test(1731.3, fwrite(DT,f, na="__NA__",..turbo=FALSE), NULL, warning="turbo.*will be removed")
+ test(1731.4, readLines(f), ans)
+ unlink(f)
+ test(1731.5, write.csv(DT,na="__NA__",row.names=FALSE,quote=FALSE), output=paste(ans,collapse=""))
+ # write.csv works on integer64 because it calls bit64's as.character method
+} else {
+ cat("Tests 1730 & 1731 not run. If required call library(bit64) first.\n")
+}
 
 
 ##########################
@@ -9457,7 +9508,8 @@ options(warn=0)
 setDTthreads(0)
 options(oldalloccol) # set at top of this file
 options(oldWhenJsymbol)
-plat = paste("endian=",.Platform$endian,", sizeof(long double)==",.Machine$sizeof.longdouble,sep="")
+plat = paste("endian=",.Platform$endian,", sizeof(long double)==",.Machine$sizeof.longdouble,
+ ", sizeof(pointer)==",.Machine$sizeof.pointer, sep="")
 if (nfail > 0) {
  if (nfail>1) {s1="s";s2="s: "} else {s1="";s2=" "}
  cat("\r")

diff --git a/man/fwrite.Rd b/man/fwrite.Rd
@@ -8,7 +8,7 @@ As \code{write.csv} and but \emph{much} faster (e.g. 1 minute versus 2 seconds)
 fwrite(x, file = "", append = FALSE, quote = "auto", sep = ",",
  eol = if (.Platform$OS.type=="windows") "\r\n" else "\n",
  na = "", col.names = TRUE, qmethod = "double",
- verbose=FALSE, turbo=TRUE)
+ verbose=FALSE, ..turbo=TRUE)
 }
 \arguments{
  \item{x}{A \code{data.table} or \code{data.frame} to write.}
@@ -25,18 +25,18 @@ fwrite(x, file = "", append = FALSE, quote = "auto", sep = ",",
  \item{"double" (default), in which case it is doubled.}
  }}
  \item{verbose}{Be chatty and report timings?}
- \item{turbo}{Use specialized custom C code to format numeric and integer columns. This reduces call overhead to the C library and avoids any use of memory buffers (copies) at all. Try with and without to see the difference it makes on your machine and please report any significant differences in output.}
+ \item{..turbo}{Use specialized custom C code to format numeric, integer and integer64 columns. This reduces call overhead to the C library and avoids any use of memory buffers (copies) at all. Try with and without to see the difference it makes on your machine and please report any significant differences in output. If you do find cases where \code{..turbo=FALSE} is needed, please report them as bugs, since this option WILL BE REMOVED in future. Hence why it has the \code{__} prefix.}
 }
 \details{
 \code{fwrite} began as a community contribution with a \href{https://github.com/Rdatatable/data.table/pull/1613}{Pull Request PR#1613} by Otto Seiskari. Following that, Matt worked on reducing time spent on I/O with buffered write. This resulted in further speed enhancements.
 
 Since those improvements resulted in the time spent almost entirely on formatting, it meant that we could benefit a lot from parallelisation. This was also done, which improved performance even further (YMMV depending on the number of cores / threads per core in one's machine).
 
-Finally, with \code{turbo = TRUE}, the time spent on formatting (through calls to C libraries) itself was reduced by implementing native C-code for writing \code{integer} and \code{numeric} types.
+Finally, with \code{..turbo = TRUE}, the time spent on formatting (through calls to C libraries) itself was reduced by implementing native C-code for writing \code{integer} and \code{numeric} types.
 
 The logic for formatting \code{integer} columns is simpler (= faster) than \code{numeric} types. Therefore columns stored as \code{numeric} types, but in reality are \code{integers} are identified and formatted using integer logic for further improvement.
 
-With this, writing a \code{data.table} of approximately 23 million rows and 19 columns (~2.85GB on disk) takes ~5.9s with \code{turbo = TRUE} and ~20s with \code{turbo = FALSE} on a 13' Macbook Pro with 512GB SSD and an i7 processor with 2 cores containing one thread per core (and two virtual threads via hyperthreading).
+With this, writing a \code{data.table} of approximately 23 million rows and 19 columns (~2.85GB on disk) takes ~5.9s with \code{..turbo = TRUE} and ~20s with \code{..turbo = FALSE} on a 13' Macbook Pro with 512GB SSD and an i7 processor with 2 cores containing one thread per core (and two virtual threads via hyperthreading).
 
 }
 \seealso{

diff --git a/src/fwrite.c b/src/fwrite.c
@@ -43,15 +43,16 @@ static inline int maxStrLen(SEXP x, int na_len) {
 #define DECIMAL_SEP '.' // TODO allow other decimal separator e.g. ','
 
 // Globals for this file only (written once to hold parameters passed from R level)
-static int na_len;
+static size_t na_len;
 static const char *na_str;
 
-static inline void writeInteger(int x, char **thisCh)
+static inline void writeInteger(long long x, char **thisCh)
 {
  char *ch = *thisCh;
- if (x == NA_INTEGER) {
- if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
- } else if (x == 0) {
+ // both integer and integer64 are passed to this function so careful
+ // to test for NA_INTEGER in the calling code. INT_MIN (NA_INTEGER) is
+ // a valid non-NA in integer64
+ if (x == 0) {
  *ch++ = '0';
  } else {
  if (x<0) { *ch++ = '-'; x=-x; }
@@ -126,7 +127,7 @@ static inline void writeNumeric(double x, char **thisCh)
  char *ch = *thisCh;
  if (!R_FINITE(x)) {
  if (ISNAN(x)) {
- if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }  // by default na_len==0 and the memcpy call will be skipped
+ memcpy(ch, na_str, na_len); ch += na_len; // by default na_len==0 and the memcpy call will be skipped
  } else if (x>0) {
  *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f';
  } else {
@@ -236,6 +237,16 @@ static inline void writeNumeric(double x, char **thisCh)
  *thisCh = ch;
 }
 
+inline Rboolean isInteger64(SEXP x) {
+ SEXP class = getAttrib(x, R_ClassSymbol);
+ if (isString(class)) {
+ for (int i=0; i<LENGTH(class); i++) { // inherits()
+ if (STRING_ELT(class, i) == char_integer64) return TRUE;
+ }
+ }
+ return FALSE;
+}
+
 SEXP writefile(SEXP list_of_columns,
  SEXP filenameArg,
  SEXP col_sep_Arg,
@@ -302,15 +313,17 @@ SEXP writefile(SEXP list_of_columns,
  SEXP levels[ncols]; // on-stack vla
  int lineLenMax = 2; // initialize with eol max width of \r\n on windows
  int sameType = TYPEOF(VECTOR_ELT(list_of_columns, 0));
+ Rboolean integer64[ncols]; // store result of isInteger64() per column for efficiency
  for (int col_i=0; col_i<ncols; col_i++) {
  SEXP column = VECTOR_ELT(list_of_columns, col_i);
- if (TYPEOF(column) != sameType) sameType = 0;
+ integer64[col_i] = FALSE;
  switch(TYPEOF(column)) {
  case LGLSXP:
  lineLenMax+=5; // width of FALSE
  break;
  case REALSXP:
- lineLenMax+=25; // +- 15digits dec e +- nnn = 22 + 3 safety = 25
+ integer64[col_i] = isInteger64(column);
+ lineLenMax+=25; // +- 15digits dec e +- nnn = 22 + 3 safety = 25. That covers int64 too (20 digits).
  break;
  case INTSXP:
  if (isFactor(column)) {
@@ -329,6 +342,9 @@ SEXP writefile(SEXP list_of_columns,
  default:
  error("Column %d's type is '%s' - not yet implemented.", col_i+1,type2char(TYPEOF(column)) );
  }
+ if (TYPEOF(column) != sameType || integer64[col_i]) sameType = 0;
+ // we could code up all-integer64 case below as well but that seems even less
+ // likely in practice than all-int or all-double
  lineLenMax++; // column separator
  }
  clock_t tlineLenMax=clock()-t0;
@@ -351,7 +367,7 @@ SEXP writefile(SEXP list_of_columns,
  for (int col_i=0; col_i<ncols; col_i++) {
  SEXP str = STRING_ELT(names, col_i);
  if (str==NA_STRING) {
- if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
+ memcpy(ch, na_str, na_len); ch += na_len;
  break;
  }
  if (quote) {
@@ -404,7 +420,7 @@ SEXP writefile(SEXP list_of_columns,
  int upp = start_row + rowsPerBatch;
  if (upp > nrows) upp = nrows;
  if (turbo && sameType == REALSXP) {
- // avoid deep switch. turbo switches on both sameType and specialized writeNumeric
+ // avoid deep switch() on type. turbo switches on both sameType and specialized writeNumeric
  for (RLEN row_i = start_row; row_i < upp; row_i++) {
  for (int col_i = 0; col_i < ncols; col_i++) {
  SEXP column = VECTOR_ELT(list_of_columns, col_i);
@@ -419,23 +435,28 @@ SEXP writefile(SEXP list_of_columns,
  for (RLEN row_i = start_row; row_i < upp; row_i++) {
  for (int col_i = 0; col_i < ncols; col_i++) {
  SEXP column = VECTOR_ELT(list_of_columns, col_i);
- writeInteger(INTEGER(column)[row_i], &ch);
+ if (INTEGER(column)[row_i] == NA_INTEGER) {
+ memcpy(ch, na_str, na_len); ch += na_len;
+ } else {
+ writeInteger(INTEGER(column)[row_i], &ch);
+ }
  *ch++ = col_sep;
  }
  ch--;
  memcpy(ch, row_sep, row_sep_len);
  ch += row_sep_len;
  }
  } else {
+ // mixed types. switch() on every cell value since must write row-by-row
  for (RLEN row_i = start_row; row_i < upp; row_i++) {
  for (int col_i = 0; col_i < ncols; col_i++) {
  SEXP column = VECTOR_ELT(list_of_columns, col_i);
- SEXP str;
+ SEXP str; // no declare within switch() allowed by C, otherwise would do
  switch(TYPEOF(column)) {
  case LGLSXP:
  true_false = LOGICAL(column)[row_i];
  if (true_false == NA_LOGICAL) {
- if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
+ memcpy(ch, na_str, na_len); ch += na_len;
  } else if (true_false) {
  memcpy(ch,"TRUE",4); // Other than strings, field widths are limited which we check elsewhere here to ensure
  ch += 4;
@@ -445,22 +466,33 @@ SEXP writefile(SEXP list_of_columns,
  }
  break;
  case REALSXP:
- if (ISNA(REAL(column)[row_i])) {
- if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
+ if (integer64[col_i]) {
+ long long i64 = *(long long *)&REAL(column)[row_i];
+ if (i64 == NAINT64) {
+ memcpy(ch, na_str, na_len); ch += na_len;
+ } else {
+ if (turbo) {
+ writeInteger(i64, &ch);
+ } else {
+ ch += sprintf(ch, "%lld", i64);
+ }
+ }
  } else {
  if (turbo) {
- // if there are any problems with the hand rolled double writing, then turbo=FALSE reverts to standard library
- writeNumeric(REAL(column)[row_i], &ch);
+ writeNumeric(REAL(column)[row_i], &ch); // handles NA, Inf etc within it
  } else {
- //tt0 = clock();
- ch += sprintf(ch, "%.15G", REAL(column)[row_i]);
- //tNUM += clock()-tt0;
+ // if there are any problems with the specialized writeNumeric, user can revert to (slower) standard library
+ if (ISNAN(REAL(column)[row_i])) {
+ memcpy(ch, na_str, na_len); ch += na_len;
+ } else {
+ ch += sprintf(ch, "%.15g", REAL(column)[row_i]);
+ }
  }
  }
  break;
  case INTSXP:
  if (INTEGER(column)[row_i] == NA_INTEGER) {
- if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
+ memcpy(ch, na_str, na_len); ch += na_len;
  } else if (levels[col_i] != NULL) { // isFactor(column) == TRUE
  str = STRING_ELT(levels[col_i], INTEGER(column)[row_i]-1);
  if (quote) {
@@ -480,7 +512,7 @@ SEXP writefile(SEXP list_of_columns,
  case STRSXP:
  str = STRING_ELT(column, row_i);
  if (str==NA_STRING) {
- if (na_len) { memcpy(ch, na_str, na_len); ch += na_len; }
+ memcpy(ch, na_str, na_len); ch += na_len;
  } else if (quote) {
  QUOTE_FIELD;
  } else {