Skip to content

Commit

Permalink
fwrite Date and IDate implemented, #1664. Including dateAs='yyyy-mm-d…
Browse files Browse the repository at this point in the history
…d'|'yyyymmdd'|'epoch'
  • Loading branch information
mattdowle committed Nov 9, 2016
1 parent cac3b6e commit de932e0
Show file tree
Hide file tree
Showing 6 changed files with 168 additions and 28 deletions.
10 changes: 7 additions & 3 deletions R/fwrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
sep=",", sep2=c("","|",""), eol=if (.Platform$OS.type=="windows") "\r\n" else "\n",
na="", dec=".", row.names=FALSE, col.names=TRUE,
qmethod=c("double","escape"),
logicalAsInt=FALSE, buffMB=8, nThread=getDTthreads(),
logicalAsInt=FALSE, dateAs=c("yyyy-mm-dd","yyyymmdd","epoch"),
buffMB=8, nThread=getDTthreads(),
showProgress = getOption("datatable.showProgress"),
verbose = getOption("datatable.verbose"),
..turbo=TRUE) {
isLOGICAL = function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R?
na = as.character(na[1L]) # fix for #1725
if (missing(qmethod)) qmethod = qmethod[1L]
if (missing(dateAs)) dateAs = dateAs[1L]
buffMB = as.integer(buffMB)
nThread = as.integer(nThread)
# write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape'
Expand All @@ -20,7 +22,8 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
is.character(dec) && length(dec)==1L && nchar(dec) == 1L,
dec != sep, # sep2!=dec and sep2!=sep checked at C level when we know if list columns are present
is.character(eol) && length(eol)==1L,
length(qmethod) == 1L && qmethod %in% c("double", "escape"),
length(qmethod) == 1L && qmethod %in% c("double", "escape"),
length(dateAs) == 1L && dateAs %in% c("yyyy-mm-dd","yyyymmdd","epoch"),
isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(row.names),
isLOGICAL(verbose), isLOGICAL(showProgress), isLOGICAL(logicalAsInt),
length(na) == 1L, #1725, handles NULL or character(0) input
Expand All @@ -40,8 +43,9 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
nThread=1L
showProgress=FALSE
}
dateAs = chmatch(dateAs, c("yyyy-mm-dd","yyyymmdd","epoch"))-1L
.Call(Cwritefile, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
row.names, col.names, logicalAsInt, buffMB, nThread,
row.names, col.names, logicalAsInt, dateAs, buffMB, nThread,
showProgress, verbose, ..turbo)
invisible()
}
Expand Down
41 changes: 39 additions & 2 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -9589,10 +9589,47 @@ test(1737.3, fwrite(list(1.2,B="foo")), output=",B1.2,foo")
test(1737.4, fwrite(list("A,Name"=1.2,B="fo,o")), output="\"A,Name\",B1.2,\"fo,o\"")
test(1737.5, fwrite(list(1.2,B=c("foo","bar"))), error="Column 2's length (2) is not the same as column 1's length (1)")

# fwrite ITime
# fwrite ITime, Date, IDate
DT = data.table(A=as.ITime(c("23:59:58","23:59:59","12:00:00","00:00:01",NA,"00:00:00")))
test(1738.1, capture.output(fwrite(DT)), c("A","23:59:58","23:59:59","12:00:00","00:00:01","","00:00:00"))
test(1739.2, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE, na="")))
test(1738.2, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE, na="")))
# Hand-picked dates on both sides of the 1970-01-01 epoch, ending with four
# consecutive days that span the 1968/1969 year boundary.
dts = c("1901-05-17","1907-10-22","1929-10-24","1962-05-28","1987-10-19","2008-09-15",
"1968-12-30","1968-12-31","1969-01-01","1969-01-02")
# The same dates held twice: column A as Date, column B as IDate.
DT = data.table(A=as.Date(dts), B=as.IDate(dts))
# 1738.3: the two columns differ in underlying storage (double vs integer), so both are exercised.
test(1738.3, sapply(DT,typeof), c(A="double",B="integer"))
# 1738.4: fwrite's default date output must equal write.csv's, line for line.
test(1738.4, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)))
# 1738.5: pins the integer day counts R assigns to the dates 0000-03-01 and 9999-12-31.
test(1738.5, as.integer(as.Date(c("0000-03-01","9999-12-31"))), c(-719468L,2932896L))

# Builds `dts` (a daily Date sequence) and `dtsCh` (the expected character form of each date).
# The first branch is permanently disabled by `if (FALSE)`; flip it by hand to run the full range.
if (FALSE) {
# Full range takes too long for CRAN.
dts = seq.Date(as.Date("0000-03-01"),as.Date("9999-12-31"),by="day")
dtsCh = as.character(dts) # 36s
dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000
# Sanity check on the fixture itself: expected length and expected first/last strings.
test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31")))
} else {
# test on CRAN a reduced but important range
dts = seq.Date(as.Date("1899-12-31"),as.Date("2100-01-01"),by="day")
dtsCh = as.character(dts)
# Sanity check on the fixture itself: expected length and expected first/last strings.
test(1739.1, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01")))
}
# Same sequence held as Date (A) and IDate (B); 1739.2/1739.3 confirm the underlying storage types.
DT = data.table(A=dts, B=as.IDate(dts))
test(1739.2, sapply(DT,typeof), c(A="double",B="integer"))
test(1739.3, typeof(dts), "double")
# Write the table once with fwrite (to f) and once with write.csv (to g), then compare the files.
# NOTE(review): the trailing timings appear to be the author's measurements for the full range - confirm.
f = tempfile()
g = tempfile() # Full range
fwrite(DT,f) # 0.092s
write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s
# 1739.4: fwrite's file equals the header "A,B" followed by "date,date" rows built from dtsCh.
test(1739.4, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=",")))
# 1739.5: fwrite's file is identical, line for line, to write.csv's file.
test(1739.5, readLines(f), readLines(g))
# Remove both temporary files.
unlink(f)
unlink(g)

# dateAs
# One unnamed Date column (the header is expected to be "V1", see 1740.4) with dates
# before, on and after the 1970-01-01 epoch, including the leap day 1972-02-29.
DT = data.table(as.Date(c("1901-05-17","1907-10-22","1970-01-01","1972-02-29","2020-01-01")))
# 1740.1: a value outside the three allowed choices (here a truncated "yyyy-mm-d") must raise
# an error whose message matches "dateAs.*not TRUE".
test(1740.1, fwrite(DT,dateAs="yyyy-mm-d"), error="dateAs.*not TRUE")
# 1740.2 / 1740.3: expected text for the separator and no-separator formats.
# NOTE(review): the expected strings run the values together with no line breaks; presumably
# test()'s output= matching ignores newlines - confirm against the test harness.
test(1740.2, fwrite(DT,dateAs="yyyy-mm-dd"), output="1901-05-171907-10-221970-01-011972-02-292020-01-01")
test(1740.3, fwrite(DT,dateAs="yyyymmdd"), output="1901051719071022197001011972022920200101")
# 1740.4: "epoch" is expected to write one integer per date, negative before 1970-01-01 and 0 on that day.
test(1740.4, capture.output(fwrite(DT,dateAs="epoch")), c("V1","-25066","-22717","0","789","18262"))


##########################
Expand Down
7 changes: 5 additions & 2 deletions man/fwrite.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ As \code{write.csv} but much faster (e.g. 2 seconds versus 1 minute) and just as
fwrite(x, file = "", append = FALSE, quote = "auto", sep = ",", sep2 = c("","|",""),
eol = if (.Platform$OS.type=="windows") "\r\n" else "\n",
na = "", dec = ".", row.names = FALSE, col.names = TRUE, qmethod = c("double","escape"),
logicalAsInt = FALSE, buffMB = 8L, nThread = getDTthreads(),
logicalAsInt = FALSE, dateAs = c("yyyy-mm-dd", "yyyymmdd", "epoch"),
buffMB = 8L, nThread = getDTthreads(),
showProgress = getOption("datatable.showProgress"),
verbose = getOption("datatable.verbose"),
..turbo=TRUE)
Expand All @@ -31,19 +32,21 @@ fwrite(x, file = "", append = FALSE, quote = "auto", sep = ",", sep2 = c("","|",
\item{"double" (default, same as \code{write.csv}), in which case the double quote is doubled with another one.}
}}
\item{logicalAsInt}{Should \code{logical} values be written as \code{1} and \code{0} rather than \code{"TRUE"} and \code{"FALSE"}?}
\item{dateAs}{The default is \code{"yyyy-mm-dd"} to match \code{write.csv}. \code{"yyyymmdd"} drops the separator so that dates can be read as integers in human readable format (that choice allows fast extraction of \code{yyyy}, \code{mm} and \code{dd} parts using \code{\%/\%10000}, \code{\%/\%100 \%\%100} and \code{\%\%100} respectively). Finally, \code{"epoch"} writes the underlying integer as-is: the number of days since the 1970-01-01 epoch (negative before that; see \code{?Date}). All three options are very fast due to new specialized C code. You should not be able to notice any difference in write speed between these options. The date range supported and tested is [0000-03-01, 9999-12-31] (3,652,365 dates including 2,424 leap days).}
\item{buffMB}{The buffer size (MB) per thread in the range 1 to 1024, default 8MB. Experiment to see what works best for your data on your hardware.}
\item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
\item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. }
\item{verbose}{Be chatty and report timings?}
\item{..turbo}{Use specialized custom C code to format numeric, integer and integer64 columns. This reduces call overhead to the C library and avoids copies. Try with and without to see the difference it makes on your machine and please report any differences in output. If you do find cases where \code{..turbo=FALSE} is needed, please report them as bugs, since this option WILL BE REMOVED in future. Hence why it has the \code{..} prefix.}
}
\details{
\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates, times and \code{sep2} for \code{list} columns where each cell can itself be a vector.
\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector.
}
\seealso{
\code{\link{setDTthreads}}, \code{\link{fread}}, \code{\link[utils]{write.csv}}, \code{\link[utils]{write.table}}, \href{https://CRAN.R-project.org/package=bit64}{\code{bit64::integer64}}
}
\references{
\url{http://howardhinnant.github.io/date_algorithms.html}
\url{https://en.wikipedia.org/wiki/Decimal_mark}
}
\examples{
Expand Down
Loading

0 comments on commit de932e0

Please sign in to comment.