Skip to content

Commit

Permalink
colClasses correspond to select (#3547)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattdowle authored May 10, 2019
1 parent 6cfa03f commit fe2bfe7
Show file tree
Hide file tree
Showing 8 changed files with 253 additions and 153 deletions.
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
* `stringsAsFactors=0.10` will factorize any character column containing under `0.10*nrow` unique strings, [#2025](https://github.com/Rdatatable/data.table/issues/2025). Thanks to @hughparsonage for the PR.
* `colClasses=list(numeric=20:30, numeric="ID")` will apply the `numeric` type to column numbers `20:30` as before and now also column name `"ID"`; i.e. all duplicate class names are now respected rather than only the first. This need may arise when specifying some columns by name and others by number, as in this example. Thanks to @hughparsonage for the PR.
* gains `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files; i.e., csv files with metadata in a header formatted as YAML (http://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). See `?fread` and files in `/inst/tests/csvy/` for sample formats. Please provide feedback if you find this feature useful and would like extended capabilities. For now, consider it experimental, meaning the API/arguments may change. Thanks to @leeper at [`rio`](https://github.com/leeper/rio) for the inspiration and @MichaelChirico for implementing.
* `select` can now be used to specify types for just the columns selected, [#1426](https://github.com/Rdatatable/data.table/issues/1426). Just like `colClasses`, it can be a named vector of `colname=type` pairs, or a named `list` of `type=col(s)` pairs. For example:

```R
fread(file, select=c(colD="character", colA="integer64")) # returns 2 columns: colD,colA
fread(file, select=list(character="colD", integer=8:10, character="colA")) # returns 5 columns: colD,8,9,10,colA
```

3. `fwrite()`:
* now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Compression, like the rest of `fwrite()`, is multithreaded and compresses each chunk on-the-fly (a full size intermediate file is not created). Use a ".gz" extension, or the new `compress=` option. Many thanks to Philippe Chataignon for the significant PR. For example:
Expand Down
20 changes: 3 additions & 17 deletions R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -269,8 +269,9 @@ yaml=FALSE, autostart=NA)
warnings2errors = getOption("warn") >= 2
ans = .Call(CfreadR,input,sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,
fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros)
if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns
nr = length(ans[[1L]])
if ((!"bit64" %chin% loadedNamespaces()) && any(sapply(ans,inherits,"integer64"))) require_bit64()
if ((!"bit64" %chin% loadedNamespaces()) && any(sapply(ans,inherits,"integer64"))) require_bit64() # nocov
setattr(ans,"row.names",.set_row_names(nr))

if (isTRUE(data.table)) {
Expand Down Expand Up @@ -299,7 +300,7 @@ yaml=FALSE, autostart=NA)
methods::as(v, new_class))
},
warning = fun <- function(e) {
warning("Column '", names(ans)[j], "' was set by colClasses to be '", new_class, "' but fread encountered the following ",
warning("Column '", names(ans)[j], "' was requested to be '", new_class, "' but fread encountered the following ",
if (inherits(e, "error")) "error" else "warning", ":\n\t", e$message, "\nso the column has been left as type '", typeof(v), "'", call.=FALSE)
return(v)
},
Expand All @@ -319,21 +320,6 @@ yaml=FALSE, autostart=NA)
for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j)))
}

if (!is.null(select)) {
if (is.numeric(select)) {
if (length(o <- forderv(select))) {
rank = integer(length(o))
rank[o] = 1:length(o)
setcolorder(ans, rank)
}
} else {
if (!identical(names(ans), select)) {
reorder = select[select %chin% names(ans)] # any missing columns are warned about in freadR.c and skipped
setcolorder(ans, reorder)
}
}
}

if (!missing(col.names)) # FR #768
setnames(ans, col.names) # setnames checks and errors automatically
if (!is.null(key) && data.table) {
Expand Down
65 changes: 42 additions & 23 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -2703,13 +2703,14 @@ test(959.1, fread(input, colClasses=c("character","double","numeric")),
test(959.2, fread(input, colClasses=c("character",NA,"numeric")),
data.table(A=c("01","002"),B=c("foo","bar"),C=c(3.14,6.28)))
test(960, fread(input, colClasses=c("character","double")),
error="colClasses.*unnamed character vector.*length is 2. Must be length 1 or ncol \\(3 in this case\\) when unnamed")
error="colClasses= is an unnamed vector of types, length 2, but there are 3 columns.*you can")
test(961, fread(input, colClasses=1:3), error="colClasses is not type list or character vector")
test(962, fread(input, colClasses=list(1:3)), error="colClasses is type list but has no names")
test(963, fread(input, colClasses=list(character="D")), error="Column name 'D' in colClasses..1.. not found")
test(964, fread(input, colClasses=c(D="character")), error="Column name 'D' in colClasses..1.. not found")
test(965, fread(input, colClasses=list(character=0)), error="Column number 0 (colClasses[[1]][1]) is out of range [1,ncol=3]")
test(966, fread(input, colClasses=list(character=2:4)), error="Column number 4 (colClasses[[1]][3]) is out of range [1,ncol=3]")
test(963, fread(input, colClasses=list(character="D")), ans<-data.table(A=1:2, B=c("foo","bar"), C=c(3.14,6.28)), warning="Column name 'D' (colClasses[[1]][1]) not found")
test(964, fread(input, colClasses=c(D="character")), ans, warning="Column name 'D' (colClasses[[1]][1]) not found")
test(965, fread(input, colClasses=list(character=0)), ans, warning="Column number 0 (colClasses[[1]][1]) is out of range [1,ncol=3]")
test(966, fread(input, colClasses=list(character=2:4)), data.table(A=1:2, B=c("foo","bar"), C=c("3.140","6.28000")),
warning="Column number 4 (colClasses[[1]][3]) is out of range [1,ncol=3]")

# Character input more than 4096 bytes (used to be passed through path.expand which imposed the limit), #2649
test(967, nrow(fread( paste( rep('a\tb\n', 10000), collapse=''), header=FALSE)), 10000L)
Expand Down Expand Up @@ -3173,13 +3174,13 @@ test(1065, X[J(2:5), (var):=22L], data.table(A=rep(1:3, each=2), B=c(1L,4L,rep(2
# fread single unnamed colClasses
f = "A,B,C,D\n1,3,5,7\n2,4,6,8\n"
test(1066, fread(f,colClasses=c("integer","integer","character")),
error="colClasses is an unnamed character vector but its length is 3. Must be.*1 or ncol.*4")
error="colClasses= is an unnamed vector of types, length 3, but there are 4 columns.*you can")
test(1067, fread(f,colClasses=c("integer","numeric","character","character")), data.table(A=1:2,B=c(3,4),C=c("5","6"),D=c("7","8")))
test(1068, fread(f,colClasses="character"), data.table(A=c("1","2"),B=c("3","4"),C=c("5","6"),D=c("7","8")))

# fread select and drop
test(1069, fread(f,drop=c("D","B")), data.table(A=1:2,C=5:6))
test(1070, fread(f,drop="E"), fread(f), warning="Column name 'E' in 'drop' not found")
test(1070, fread(f,drop="E"), fread(f), warning="Column name 'E' (drop[1]) not found")
test(1071, fread(f,select="B",colClasses=list(numeric="C")), data.table(B=3:4))
test(1072, fread(f,select="B",drop="C"), error="not both")
test(1073, fread(f,drop=2:3), fread(f,select=c(1,4))) # tests coercing numeric select as well
Expand Down Expand Up @@ -10454,7 +10455,16 @@ test(1743.193, sapply(fread("a,b,c,d\n2,2,0f,x", colClasses = list(raw = c(1L, 3
test(1743.194, sapply(fread("a,b,c,d\n2,2,0f,x", colClasses = list(raw = c("a", "c"), Date = "d"), drop = c(1L, 4L)), class), y = c(b="integer", c="raw"))
test(1743.195, sapply(fread("a,b,c,d\n2,2,0f,x", colClasses = list(raw = c("a", "c"), Date = "d"), select = c(2L, 3L)), class), y = c(b="integer", c="raw"))
test(1743.196, sapply(fread("a,b,c,d\n2,0+1i,2,x", colClasses = list(raw = c("a", "c"), complex = "b", Date = "d"), select = c(2L, 3L)), class), y = c(b="complex", c="raw"))
test(1743.197, sapply(fread("A,B,C,D\nA,B,X,4", select = c(1, 4, 3, 2), colClasses = c("factor", "factor", "character", "integer")), class), c("A" = "factor", "D" = "integer", "C" = "character", "B" = "factor"))

# colClasses in select; #1426
# These tests cover passing column types through select= itself, either as a named
# vector of colname=type pairs or as a named list of type=col(s) pairs. In every
# expected result the columns appear in the order given by select=.
# Baseline: numeric select= with a positional colClasses= vector; result is reused below as `ans`.
test(1743.197, fread("A,B,C,D\nA,B,X,4", select=c(1,4,3,2), colClasses=c("factor","factor","character","integer")), ans<-data.table(A=factor("A"), D=4L, C="X", B=factor("B")))
# A list passed to select= must be named; an unnamed list is expected to error.
test(1743.198, fread("A,B,C,D\nA,B,X,4", select=list(c(1,4,3,2), c("factor","integer","character","factor"))), error="select= is type list but has no names")
# Named vector form (colname=type) is expected to give the same result as the baseline.
test(1743.199, fread("A,B,C,D\nA,B,X,4", select=c(A="factor", D="integer", C="character", B="factor")), ans)
test(1743.200, fread("A,B,C,D\nA,B,X,4", select=list(factor="A", integer=4, character="C", factor=2)), ans) # all 4 columns but in different order by list form of select
# Supplying types via select= (list or named vector) together with colClasses= is expected to error.
test(1743.201, fread("A,B,C,D\nA,B,X,4", select=list(factor="A"), colClasses="character"), error="select= is type list.*but colClasses= has been provided as well. Please remove colClasses.")
test(1743.202, fread("A,B,C,D\nA,B,X,4", select=c(A="factor"), colClasses="character"), error="select= is a named vector.*but colClasses= has been provided as well. Please remove colClasses=.")
# List form selecting a subset: only D and B are returned, D first.
test(1743.203, fread("A,B,C,D\nA,B,X,4", select=list(character="D", factor="B")), data.table(D="4", B=factor("B")))
# The same type name used twice in the list: both entries are expected to be applied.
test(1743.204, fread("A,B,C,D\nA,B,X,4", select=list(character=4, character=2)), data.table(D="4", B="B"))

## factors
test(1743.211, sapply(fread("a,b,c\n2,2,f", colClasses = list(factor = 1L), select = 2:3), class), y = c(b="integer", c="character"))
Expand Down Expand Up @@ -10489,17 +10499,23 @@ data1743 = "A,B,C,D\n1,3,5,7\n2,4,6,8\n"
test(1743.301, fread(data1743, colClasses=c("B"="NULL","C"="NULL")), ans<-data.table(A=1:2, D=7:8))
test(1743.302, fread(data1743, colClasses=list(NULL=c("B","C"))), ans)
test(1743.303, fread(data1743, drop=c("B","C")), ans)
test(1743.304, fread(data1743, drop=2:3), ans)
test(1743.3041, fread(data1743, drop=2:3), ans)
test(1743.3042, fread(data1743, drop=c(2,NA,3)), ans, warning="drop[2] is NA")
test(1743.3043, fread(data1743, colClasses=list(NULL=c(2,NA,3))), ans, warning="colClasses[[1]][2] is NA")
test(1743.305, fread(data1743, colClasses=c("integer", "NULL", "NULL", "integer")), ans)
test(1743.306, fread(data1743, colClasses=c("integer", "NULL", "NULL", "integer"), drop=4), data.table(A=1:2))
test(1743.307, fread(data1743, colClasses=list(NULL=c("C","D"), NULL=1:2)), data.table(A=1:2, B=3:4),
warning="There is more than one NULL item in colClasses= list. Ignoring all but the first.")
test(1743.308, fread(data1743, colClasses=list(NULL=c("C","D")), drop=1:2), data.table(C=5:6, D=7:8),
warning="Ignoring the NULL item in colClasses= because select= or drop= has been used")
test(1743.3071, fread(data1743, colClasses=list(NULL=c("C","D"), NULL=1:2)), data.table(NULL))
test(1743.3072, fread(data1743, colClasses=list(NULL=c("C","D"), NULL=1)), data.table(B=3:4))
test(1743.308, fread(data1743, colClasses=list(NULL=c("C","D")), drop=1:2), data.table(NULL))
test(1743.311, fread(data1743, colClasses="NULL"), ans<-data.table(A=1:2, B=3:4, C=5:6, D=7:8), warning="colClasses.*quoted.*interpreted as colClasses.*NULL")
test(1743.312, fread(data1743, colClasses=character()), ans)
test(1743.32, fread("A,B\na,0+1i", colClasses="complex"), data.table(A="a", B=1i),
warning="Column 'A' was set.*complex.*NAs introduced by coercion.*column has been left as.*character")
warning="Column 'A' was requested to be 'complex'.*NAs introduced by coercion.*column has been left as.*character")
test(1743.33, fread(data1743, colClasses=list("character"=4, "numeric"=c(2,NA,1))), data.table(A=c(1,2), B=c(3,4), C=5:6, D=c("7","8")), warning="colClasses[[2]][2] is NA")
test(1743.34, fread(data1743, select=list("character"=4, "numeric"=c(2,NA,1))), data.table(D=c("7","8"), B=c(3,4), A=c(1,2)), warning="colClasses[[2]][2] is NA")
old = options(warn=2)
test(1743.35, fread(data1743, select=list("character"=4, "numeric"=c(2,NA,1))), error="colClasses[[2]][2] is NA")
options(old)

# stringsAsFactors = double; #2025
fwrite(data.table(V1 = sample(letters, size=26*10, replace=TRUE),
Expand Down Expand Up @@ -10828,11 +10844,10 @@ test(1755, fread(testDir("unescaped.csv"), logical01=TRUE),
# test duplicated colClasses
txt = "A,B,C,D\n1,3,5,7\n2,4,6,8\n"
test(1756.1, fread(txt), data.table(A=1:2, B=3:4, C=5:6, D=7:8))
test(1756.2, fread(txt, colClasses=list('numeric'=c(1,3))), data.table(A=as.double(1:2), B=3:4, C=as.double(5:6), D=7:8))
test(1756.3, fread(txt, colClasses=list('numeric'=c(1,3,1))), error="Column 'A' appears more than once in colClasses")
test(1756.4, fread(txt, colClasses=list('numeric'=c(1,3),'character'=2)),
data.table(A=as.double(1:2), B=c("3","4"), C=as.double(5:6), D=7:8))
test(1756.5, fread(txt, colClasses=list('numeric'=c(1,2),'character'=2)), error="Column 'B' appears more than once")
test(1756.2, fread(txt, colClasses=list('numeric'=c(1,3))), ans<-data.table(A=as.double(1:2), B=3:4, C=as.double(5:6), D=7:8))
test(1756.3, fread(txt, colClasses=list('numeric'=c(1,3,1))), ans, warning="Column 1 ('A') appears more than once in colClasses. The second time is colClasses[[1]][3]")
test(1756.4, fread(txt, colClasses=list('numeric'=c(1,3),'character'=2)), ans<-data.table(A=as.double(1:2), B=c("3","4"), C=as.double(5:6), D=7:8))
test(1756.5, fread(txt, colClasses=list('numeric'=c(1,3),'character'=2:3)), ans, warning="Column 3 ('C') appears more than once in colClasses. The second time is colClasses[[2]][2]")

# Windows \r\n line endings when using multiple threads and detecting type within quoted fields, #2087
if (test_R.utils) {
Expand Down Expand Up @@ -11361,15 +11376,13 @@ test(1825.12, fread("a,b,c\n1,2,3.0\n2,3,4.5", colClasses = c("integer", "intege
warning="Attempt to override column 3 <<c>> of inherent type 'float64' down to 'int32'")
test(1825.13, fread("a,b,c\n1,2,3.0\n2,3,4.5", colClasses=list(NULL="b")), data.table(a=1:2, c=c(3,4.5)))
test(1825.14, fread(str, colClasses=list(integer=1:2, NULL=3:5)), data.table(x1=INT(1,3), x2=INT(2,4)))
test(1825.15, fread(str, colClasses=list(numeric=2, NULL=3:5), drop=1),
data.table(x2=c(2,4), x3=c(1.5,2.5), x4=c("T","F"), x5=c("cc","ff")),
warning="Ignoring the NULL item in colClasses= because select= or drop= has been used") # warning because drop != colClasses$`NULL`
test(1825.15, fread(str, colClasses=list(numeric=2, NULL=3:5), drop=1), data.table(x2=c(2,4)))
test(1825.16, fread(str, colClasses=(cl<-list(numeric=2, NULL=3:5)), drop=cl$`NULL`), data.table(x1=INT(1,3), x2=c(2,4))) # cover commit f0bd6e3
# NULL didn't work in 1.11.0-1.11.8 so some usage exists where drop= is used to respecify the NULLs. The warning could be reintroduced in future.
# https://github.com/Rdatatable/data.table/issues/3233#issuecomment-453674647
test(1825.17, fread(str, colClasses=c("integer","integer","NULL","character","NULL"), drop=3), data.table(x1=INT(1,3), x2=INT(2,4), x4=c("T","F")))
test(1825.18, fread(str, colClasses=c("integer","numeric","NULL","character","NULL"), drop=3:4), data.table(x1=INT(1,3), x2=c(2,4)))
test(1825.19, fread(str, drop=6), data.table(x1=INT(1,3), x2=INT(2,4), x3=c(1.5,2.5), x4=c("T","F"), x5=c("cc","ff")), warning="Column number 6 (drop[1]) is out of range [1,ncol=5]")
test(1825.19, fread(str, drop=6), data.table(x1=INT(1,3), x2=INT(2,4), x3=c(1.5,2.5), x4=c("T","F"), x5=c("cc","ff")), warning="drop[1] is 6 which is out of range [1,ncol=5]")
# extra tests from #3143
DT = data.table(a = c(1.0, 2.0, 3.0, 4.0, 5.1), b = c("1", "2", "E", "4", "5"))
fwrite(DT, f<-tempfile())
Expand Down Expand Up @@ -14387,6 +14400,12 @@ test(2033.1,
dcast(DT, a ~ b, value.var = list('c', 'd'), fun.aggregate = list(sum)),
error = "When 'fun.aggregate' and 'value.var' are both lists")

# fread no quote coverage: exercises quote= with an invalid value and with quoting switched off
# A quote= value longer than one character is expected to raise an error.
test(2034.1, fread('A,B\n"foo","ba"r"', quote="''"), error='quote= must be a single character, blank "", or FALSE')
# quote=FALSE: the expected result keeps the double quotes as part of each field.
test(2034.2, fread('A,B\n"foo","ba"r"', quote=FALSE), ans<-data.table(A='"foo"', B='"ba"r"'))
# quote="" is expected to give the same result as quote=FALSE.
test(2034.3, fread('A,B\n"foo","ba"r"', quote=""), ans)


###################################
# Add new tests above this line #
###################################
Expand Down
Loading

0 comments on commit fe2bfe7

Please sign in to comment.