if (FALSE) { # \dontrun{
example(data.table) # to run these examples yourself
} # }
DF = data.frame(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
DT = data.table(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
DF
#> x y v
#> 1 b 1 1
#> 2 b 3 2
#> 3 b 6 3
#> 4 a 1 4
#> 5 a 3 5
#> 6 a 6 6
#> 7 c 1 7
#> 8 c 3 8
#> 9 c 6 9
DT
#> x y v
#> <char> <num> <int>
#> 1: b 1 1
#> 2: b 3 2
#> 3: b 6 3
#> 4: a 1 4
#> 5: a 3 5
#> 6: a 6 6
#> 7: c 1 7
#> 8: c 3 8
#> 9: c 6 9
identical(dim(DT), dim(DF)) # TRUE
#> [1] TRUE
identical(DF$a, DT$a) # TRUE
#> [1] TRUE
is.list(DF) # TRUE
#> [1] TRUE
is.list(DT) # TRUE
#> [1] TRUE
is.data.frame(DT) # TRUE
#> [1] TRUE
tables()
#> NAME NROW NCOL MB COLS KEY
#> 1: DT 9 3 0 x,y,v [NULL]
#> Total: 0MB using type_size
# basic row subset operations
DT[2] # 2nd row
#> x y v
#> <char> <num> <int>
#> 1: b 3 2
DT[3:2] # 3rd and 2nd row
#> x y v
#> <char> <num> <int>
#> 1: b 6 3
#> 2: b 3 2
DT[order(x)] # no need for order(DT$x)
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
#> 4: b 1 1
#> 5: b 3 2
#> 6: b 6 3
#> 7: c 1 7
#> 8: c 3 8
#> 9: c 6 9
DT[order(x), ] # same as above. The ',' is optional
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
#> 4: b 1 1
#> 5: b 3 2
#> 6: b 6 3
#> 7: c 1 7
#> 8: c 3 8
#> 9: c 6 9
DT[y>2] # all rows where DT$y > 2
#> x y v
#> <char> <num> <int>
#> 1: b 3 2
#> 2: b 6 3
#> 3: a 3 5
#> 4: a 6 6
#> 5: c 3 8
#> 6: c 6 9
DT[y>2 & v>5] # compound logical expressions
#> x y v
#> <char> <num> <int>
#> 1: a 6 6
#> 2: c 3 8
#> 3: c 6 9
DT[!2:4] # all rows other than 2:4
#> x y v
#> <char> <num> <int>
#> 1: b 1 1
#> 2: a 3 5
#> 3: a 6 6
#> 4: c 1 7
#> 5: c 3 8
#> 6: c 6 9
DT[-(2:4)] # same
#> x y v
#> <char> <num> <int>
#> 1: b 1 1
#> 2: a 3 5
#> 3: a 6 6
#> 4: c 1 7
#> 5: c 3 8
#> 6: c 6 9
# select|compute columns data.table way
DT[, v] # v column (as vector)
#> [1] 1 2 3 4 5 6 7 8 9
DT[, list(v)] # v column (as data.table)
#> v
#> <int>
#> 1: 1
#> 2: 2
#> 3: 3
#> 4: 4
#> 5: 5
#> 6: 6
#> 7: 7
#> 8: 8
#> 9: 9
DT[, .(v)] # same as above, .() is a shorthand alias to list()
#> v
#> <int>
#> 1: 1
#> 2: 2
#> 3: 3
#> 4: 4
#> 5: 5
#> 6: 6
#> 7: 7
#> 8: 8
#> 9: 9
DT[, sum(v)] # sum of column v, returned as vector
#> [1] 45
DT[, .(sum(v))] # same, but return data.table (column autonamed V1)
#> V1
#> <int>
#> 1: 45
DT[, .(sv=sum(v))] # same, but column named "sv"
#> sv
#> <int>
#> 1: 45
DT[, .(v, v*2)] # return two column data.table, v and v*2
#> v V2
#> <int> <num>
#> 1: 1 2
#> 2: 2 4
#> 3: 3 6
#> 4: 4 8
#> 5: 5 10
#> 6: 6 12
#> 7: 7 14
#> 8: 8 16
#> 9: 9 18
# subset rows and select|compute data.table way
DT[2:3, sum(v)] # sum(v) over rows 2 and 3, return vector
#> [1] 5
DT[2:3, .(sum(v))] # same, but return data.table with column V1
#> V1
#> <int>
#> 1: 5
DT[2:3, .(sv=sum(v))] # same, but return data.table with column sv
#> sv
#> <int>
#> 1: 5
DT[2:5, cat(v, "\n")] # just for j's side effect
#> 2 3 4 5
#> NULL
# select columns the data.frame way
DT[, 2] # 2nd column, returns a data.table always
#> y
#> <num>
#> 1: 1
#> 2: 3
#> 3: 6
#> 4: 1
#> 5: 3
#> 6: 6
#> 7: 1
#> 8: 3
#> 9: 6
colNum = 2
DT[, ..colNum] # same, .. prefix conveys one-level-up in calling scope
#> y
#> <num>
#> 1: 1
#> 2: 3
#> 3: 6
#> 4: 1
#> 5: 3
#> 6: 6
#> 7: 1
#> 8: 3
#> 9: 6
DT[["v"]] # same as DT[, v] but faster if called in a loop
#> [1] 1 2 3 4 5 6 7 8 9
# grouping operations - j and by
DT[, sum(v), by=x] # ad hoc by, order of groups preserved in result
#> x V1
#> <char> <int>
#> 1: b 6
#> 2: a 15
#> 3: c 24
DT[, sum(v), keyby=x] # same, but order the result on by cols
#> Key: <x>
#> x V1
#> <char> <int>
#> 1: a 15
#> 2: b 6
#> 3: c 24
DT[, sum(v), by=x, keyby=TRUE] # same, but using sorting flag
#> Key: <x>
#> x V1
#> <char> <int>
#> 1: a 15
#> 2: b 6
#> 3: c 24
DT[, sum(v), by=x][order(x)] # same but by chaining expressions together
#> x V1
#> <char> <int>
#> 1: a 15
#> 2: b 6
#> 3: c 24
# fast ad hoc row subsets (subsets as joins)
DT["a", on="x"] # same as x == "a" but uses binary search (fast)
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
# NB: requires DT to be keyed!
DT["a", on=.(x)] # same, for convenience, no need to quote every column
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
# NB: works regardless of whether or not DT is keyed!
DT[.("a"), on="x"] # same
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
DT[x=="a"] # same, single "==" internally optimised to use binary search (fast)
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
DT[x!="b" | y!=3] # not yet optimized, currently vector scan subset
#> x y v
#> <char> <num> <int>
#> 1: b 1 1
#> 2: b 6 3
#> 3: a 1 4
#> 4: a 3 5
#> 5: a 6 6
#> 6: c 1 7
#> 7: c 3 8
#> 8: c 6 9
DT[.("b", 3), on=c("x", "y")] # join on columns x,y of DT; uses binary search (fast)
#> x y v
#> <char> <num> <int>
#> 1: b 3 2
DT[.("b", 3), on=.(x, y)] # same, but using on=.()
#> x y v
#> <char> <num> <int>
#> 1: b 3 2
DT[.("b", 1:2), on=c("x", "y")] # no match returns NA
#> x y v
#> <char> <int> <int>
#> 1: b 1 1
#> 2: b 2 NA
DT[.("b", 1:2), on=.(x, y), nomatch=NULL] # no match row is not returned
#> x y v
#> <char> <int> <int>
#> 1: b 1 1
DT[.("b", 1:2), on=c("x", "y"), roll=Inf] # locf, nomatch row gets rolled by previous row
#> x y v
#> <char> <int> <int>
#> 1: b 1 1
#> 2: b 2 1
DT[.("b", 1:2), on=.(x, y), roll=-Inf] # nocb, nomatch row gets rolled by next row
#> x y v
#> <char> <int> <int>
#> 1: b 1 1
#> 2: b 2 2
DT["b", sum(v*y), on="x"] # on rows where DT$x=="b", calculate sum(v*y)
#> [1] 25
# all together now
DT[x!="a", sum(v), by=x] # get sum(v) by "x" for each i != "a"
#> x V1
#> <char> <int>
#> 1: b 6
#> 2: c 24
DT[!"a", sum(v), by=.EACHI, on="x"] # same, but using subsets-as-joins
#> x V1
#> <char> <int>
#> 1: b 6
#> 2: c 24
DT[c("b","c"), sum(v), by=.EACHI, on="x"] # same
#> x V1
#> <char> <int>
#> 1: b 6
#> 2: c 24
DT[c("b","c"), sum(v), by=.EACHI, on=.(x)] # same, using on=.()
#> x V1
#> <char> <int>
#> 1: b 6
#> 2: c 24
# joins as subsets
X = data.table(x=c("c","b"), v=8:7, foo=c(4,2))
X
#> x v foo
#> <char> <int> <num>
#> 1: c 8 4
#> 2: b 7 2
DT[X, on="x"] # right join
#> x y v i.v foo
#> <char> <num> <int> <int> <num>
#> 1: c 1 7 8 4
#> 2: c 3 8 8 4
#> 3: c 6 9 8 4
#> 4: b 1 1 7 2
#> 5: b 3 2 7 2
#> 6: b 6 3 7 2
X[DT, on="x"] # left join
#> x v foo y i.v
#> <char> <int> <num> <num> <int>
#> 1: b 7 2 1 1
#> 2: b 7 2 3 2
#> 3: b 7 2 6 3
#> 4: a NA NA 1 4
#> 5: a NA NA 3 5
#> 6: a NA NA 6 6
#> 7: c 8 4 1 7
#> 8: c 8 4 3 8
#> 9: c 8 4 6 9
DT[X, on="x", nomatch=NULL] # inner join
#> x y v i.v foo
#> <char> <num> <int> <int> <num>
#> 1: c 1 7 8 4
#> 2: c 3 8 8 4
#> 3: c 6 9 8 4
#> 4: b 1 1 7 2
#> 5: b 3 2 7 2
#> 6: b 6 3 7 2
DT[!X, on="x"] # not join
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
DT[X, on=c(y="v")] # join using column "y" of DT with column "v" of X
#> x y v i.x foo
#> <char> <int> <int> <char> <num>
#> 1: <NA> 8 NA c 4
#> 2: <NA> 7 NA b 2
DT[X, on="y==v"] # same as above (v1.9.8+)
#> x y v i.x foo
#> <char> <int> <int> <char> <num>
#> 1: <NA> 8 NA c 4
#> 2: <NA> 7 NA b 2
DT[X, on=.(y<=foo)] # NEW non-equi join (v1.9.8+)
#> x y v i.x i.v
#> <char> <num> <int> <char> <int>
#> 1: b 4 1 c 8
#> 2: b 4 2 c 8
#> 3: a 4 4 c 8
#> 4: a 4 5 c 8
#> 5: c 4 7 c 8
#> 6: c 4 8 c 8
#> 7: b 2 1 b 7
#> 8: a 2 4 b 7
#> 9: c 2 7 b 7
DT[X, on="y<=foo"] # same as above
#> x y v i.x i.v
#> <char> <num> <int> <char> <int>
#> 1: b 4 1 c 8
#> 2: b 4 2 c 8
#> 3: a 4 4 c 8
#> 4: a 4 5 c 8
#> 5: c 4 7 c 8
#> 6: c 4 8 c 8
#> 7: b 2 1 b 7
#> 8: a 2 4 b 7
#> 9: c 2 7 b 7
DT[X, on=c("y<=foo")] # same as above
#> x y v i.x i.v
#> <char> <num> <int> <char> <int>
#> 1: b 4 1 c 8
#> 2: b 4 2 c 8
#> 3: a 4 4 c 8
#> 4: a 4 5 c 8
#> 5: c 4 7 c 8
#> 6: c 4 8 c 8
#> 7: b 2 1 b 7
#> 8: a 2 4 b 7
#> 9: c 2 7 b 7
DT[X, on=.(y>=foo)] # NEW non-equi join (v1.9.8+)
#> x y v i.x i.v
#> <char> <num> <int> <char> <int>
#> 1: b 4 3 c 8
#> 2: a 4 6 c 8
#> 3: c 4 9 c 8
#> 4: b 2 2 b 7
#> 5: b 2 3 b 7
#> 6: a 2 5 b 7
#> 7: a 2 6 b 7
#> 8: c 2 8 b 7
#> 9: c 2 9 b 7
DT[X, on=.(x, y<=foo)] # NEW non-equi join (v1.9.8+)
#> x y v i.v
#> <char> <num> <int> <int>
#> 1: c 4 7 8
#> 2: c 4 8 8
#> 3: b 2 1 7
DT[X, .(x,y,x.y,v), on=.(x, y>=foo)] # Select x's join columns as well
#> x y x.y v
#> <char> <num> <num> <int>
#> 1: c 4 6 9
#> 2: b 2 3 2
#> 3: b 2 6 3
DT[X, on="x", mult="first"] # first row of each group
#> x y v i.v foo
#> <char> <num> <int> <int> <num>
#> 1: c 1 7 8 4
#> 2: b 1 1 7 2
DT[X, on="x", mult="last"] # last row of each group
#> x y v i.v foo
#> <char> <num> <int> <int> <num>
#> 1: c 6 9 8 4
#> 2: b 6 3 7 2
DT[X, sum(v), by=.EACHI, on="x"] # join and eval j for each row in i
#> x V1
#> <char> <int>
#> 1: c 24
#> 2: b 6
DT[X, sum(v)*foo, by=.EACHI, on="x"] # join inherited scope
#> x V1
#> <char> <num>
#> 1: c 96
#> 2: b 12
DT[X, sum(v)*i.v, by=.EACHI, on="x"] # 'i,v' refers to X's v column
#> x V1
#> <char> <int>
#> 1: c 192
#> 2: b 42
DT[X, on=.(x, v>=v), sum(y)*foo, by=.EACHI] # NEW non-equi join with by=.EACHI (v1.9.8+)
#> x v V1
#> <char> <int> <num>
#> 1: c 8 36
#> 2: b 7 NA
# setting keys
kDT = copy(DT) # (deep) copy DT to kDT to work with it.
setkey(kDT,x) # set a 1-column key. No quotes, for convenience.
setkeyv(kDT,"x") # same (v in setkeyv stands for vector)
v="x"
setkeyv(kDT,v) # same
haskey(kDT) # TRUE
#> [1] TRUE
key(kDT) # "x"
#> [1] "x"
# fast *keyed* subsets
kDT["a"] # subset-as-join on *key* column 'x'
#> Key: <x>
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
kDT["a", on="x"] # same, being explicit using 'on=' (preferred)
#> Key: <x>
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
# all together
kDT[!"a", sum(v), by=.EACHI] # get sum(v) for each i != "a"
#> Key: <x>
#> x V1
#> <char> <int>
#> 1: b 6
#> 2: c 24
# multi-column key
setkey(kDT,x,y) # 2-column key
setkeyv(kDT,c("x","y")) # same
# fast *keyed* subsets on multi-column key
kDT["a"] # join to 1st column of key
#> Key: <x, y>
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
kDT["a", on="x"] # on= is optional, but is preferred
#> Key: <x, y>
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
kDT[.("a")] # same, .() is an alias for list()
#> Key: <x, y>
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
kDT[list("a")] # same
#> Key: <x, y>
#> x y v
#> <char> <num> <int>
#> 1: a 1 4
#> 2: a 3 5
#> 3: a 6 6
kDT[.("a", 3)] # join to 2 columns
#> Key: <x, y>
#> x y v
#> <char> <num> <int>
#> 1: a 3 5
kDT[.("a", 3:6)] # join 4 rows (2 missing)
#> x y v
#> <char> <int> <int>
#> 1: a 3 5
#> 2: a 4 NA
#> 3: a 5 NA
#> 4: a 6 6
kDT[.("a", 3:6), nomatch=NULL] # remove missing
#> Key: <x, y>
#> x y v
#> <char> <int> <int>
#> 1: a 3 5
#> 2: a 6 6
kDT[.("a", 3:6), roll=TRUE] # locf rolling join
#> x y v
#> <char> <int> <int>
#> 1: a 3 5
#> 2: a 4 5
#> 3: a 5 5
#> 4: a 6 6
kDT[.("a", 3:6), roll=Inf] # same as above
#> x y v
#> <char> <int> <int>
#> 1: a 3 5
#> 2: a 4 5
#> 3: a 5 5
#> 4: a 6 6
kDT[.("a", 3:6), roll=-Inf] # nocb rolling join
#> x y v
#> <char> <int> <int>
#> 1: a 3 5
#> 2: a 4 6
#> 3: a 5 6
#> 4: a 6 6
kDT[!.("a")] # not join
#> Key: <x, y>
#> x y v
#> <char> <num> <int>
#> 1: b 1 1
#> 2: b 3 2
#> 3: b 6 3
#> 4: c 1 7
#> 5: c 3 8
#> 6: c 6 9
kDT[!"a"] # same
#> Key: <x, y>
#> x y v
#> <char> <num> <int>
#> 1: b 1 1
#> 2: b 3 2
#> 3: b 6 3
#> 4: c 1 7
#> 5: c 3 8
#> 6: c 6 9
# more on special symbols, see also ?"special-symbols"
DT[.N] # last row
#> x y v
#> <char> <num> <int>
#> 1: c 6 9
DT[, .N] # total number of rows in DT
#> [1] 9
DT[, .N, by=x] # number of rows in each group
#> x N
#> <char> <int>
#> 1: b 3
#> 2: a 3
#> 3: c 3
DT[, .SD, .SDcols=x:y] # select columns 'x' through 'y'
#> Index: <x>
#> x y
#> <char> <num>
#> 1: b 1
#> 2: b 3
#> 3: b 6
#> 4: a 1
#> 5: a 3
#> 6: a 6
#> 7: c 1
#> 8: c 3
#> 9: c 6
DT[ , .SD, .SDcols = !x:y] # drop columns 'x' through 'y'
#> v
#> <int>
#> 1: 1
#> 2: 2
#> 3: 3
#> 4: 4
#> 5: 5
#> 6: 6
#> 7: 7
#> 8: 8
#> 9: 9
DT[ , .SD, .SDcols = patterns('^[xv]')] # select columns matching '^x' or '^v'
#> Index: <x>
#> x v
#> <char> <int>
#> 1: b 1
#> 2: b 2
#> 3: b 3
#> 4: a 4
#> 5: a 5
#> 6: a 6
#> 7: c 7
#> 8: c 8
#> 9: c 9
DT[, .SD[1]] # first row of all columns
#> x y v
#> <char> <num> <int>
#> 1: b 1 1
DT[, .SD[1], by=x] # first row of 'y' and 'v' for each group in 'x'
#> x y v
#> <char> <num> <int>
#> 1: b 1 1
#> 2: a 1 4
#> 3: c 1 7
DT[, c(.N, lapply(.SD, sum)), by=x] # get rows *and* sum columns 'v' and 'y' by group
#> x N y v
#> <char> <int> <num> <int>
#> 1: b 3 10 6
#> 2: a 3 10 15
#> 3: c 3 10 24
DT[, .I[1], by=x] # row number in DT corresponding to each group
#> x V1
#> <char> <int>
#> 1: b 1
#> 2: a 4
#> 3: c 7
DT[, grp := .GRP, by=x] # add a group counter column
#> Index: <x>
#> x y v grp
#> <char> <num> <int> <int>
#> 1: b 1 1 1
#> 2: b 3 2 1
#> 3: b 6 3 1
#> 4: a 1 4 2
#> 5: a 3 5 2
#> 6: a 6 6 2
#> 7: c 1 7 3
#> 8: c 3 8 3
#> 9: c 6 9 3
DT[ , dput(.BY), by=.(x,y)] # .BY is a list of singletons for each group
#> list(x = "b", y = 1)
#> list(x = "b", y = 3)
#> list(x = "b", y = 6)
#> list(x = "a", y = 1)
#> list(x = "a", y = 3)
#> list(x = "a", y = 6)
#> list(x = "c", y = 1)
#> list(x = "c", y = 3)
#> list(x = "c", y = 6)
#> x y x y
#> <char> <num> <char> <num>
#> 1: b 1 b 1
#> 2: b 3 b 3
#> 3: b 6 b 6
#> 4: a 1 a 1
#> 5: a 3 a 3
#> 6: a 6 a 6
#> 7: c 1 c 1
#> 8: c 3 c 3
#> 9: c 6 c 6
X[, DT[.BY, y, on="x"], by=x] # join within each group
#> x V1
#> <char> <num>
#> 1: c 1
#> 2: c 3
#> 3: c 6
#> 4: b 1
#> 5: b 3
#> 6: b 6
DT[, {
# write each group to a different file
fwrite(.SD, file.path(tempdir(), paste0('x=', .BY$x, '.csv')))
}, by=x]
#> Empty data.table (0 rows and 1 cols): x
dir(tempdir())
#> [1] "bslib-c167c22974dd79c09b3195ad9df50566"
#> [2] "downlit"
#> [3] "file5fe710e4f9"
#> [4] "x=a.csv"
#> [5] "x=b.csv"
#> [6] "x=c.csv"
# add/update/delete by reference (see ?assign)
print(DT[, z:=42L]) # add new column by reference
#> Index: <x>
#> x y v grp z
#> <char> <num> <int> <int> <int>
#> 1: b 1 1 1 42
#> 2: b 3 2 1 42
#> 3: b 6 3 1 42
#> 4: a 1 4 2 42
#> 5: a 3 5 2 42
#> 6: a 6 6 2 42
#> 7: c 1 7 3 42
#> 8: c 3 8 3 42
#> 9: c 6 9 3 42
print(DT[, z:=NULL]) # remove column by reference
#> Index: <x>
#> x y v grp
#> <char> <num> <int> <int>
#> 1: b 1 1 1
#> 2: b 3 2 1
#> 3: b 6 3 1
#> 4: a 1 4 2
#> 5: a 3 5 2
#> 6: a 6 6 2
#> 7: c 1 7 3
#> 8: c 3 8 3
#> 9: c 6 9 3
print(DT["a", v:=42L, on="x"]) # subassign to existing v column by reference
#> Index: <x>
#> x y v grp
#> <char> <num> <int> <int>
#> 1: b 1 1 1
#> 2: b 3 2 1
#> 3: b 6 3 1
#> 4: a 1 42 2
#> 5: a 3 42 2
#> 6: a 6 42 2
#> 7: c 1 7 3
#> 8: c 3 8 3
#> 9: c 6 9 3
print(DT["b", v2:=84L, on="x"]) # subassign to new column by reference (NA padded)
#> Index: <x>
#> x y v grp v2
#> <char> <num> <int> <int> <int>
#> 1: b 1 1 1 84
#> 2: b 3 2 1 84
#> 3: b 6 3 1 84
#> 4: a 1 42 2 NA
#> 5: a 3 42 2 NA
#> 6: a 6 42 2 NA
#> 7: c 1 7 3 NA
#> 8: c 3 8 3 NA
#> 9: c 6 9 3 NA
DT[, m:=mean(v), by=x][] # add new column by reference by group
#> Index: <x>
#> x y v grp v2 m
#> <char> <num> <int> <int> <int> <num>
#> 1: b 1 1 1 84 2
#> 2: b 3 2 1 84 2
#> 3: b 6 3 1 84 2
#> 4: a 1 42 2 NA 42
#> 5: a 3 42 2 NA 42
#> 6: a 6 42 2 NA 42
#> 7: c 1 7 3 NA 8
#> 8: c 3 8 3 NA 8
#> 9: c 6 9 3 NA 8
# NB: postfix [] is shortcut to print()
# advanced usage
DT = data.table(x=rep(c("b","a","c"),each=3), v=c(1,1,1,2,2,1,1,2,2), y=c(1,3,6), a=1:9, b=9:1)
DT[, sum(v), by=.(y%%2)] # expressions in by
#> y V1
#> <num> <num>
#> 1: 1 9
#> 2: 0 4
DT[, sum(v), by=.(bool = y%%2)] # same, using a named list to change by column name
#> bool V1
#> <num> <num>
#> 1: 1 9
#> 2: 0 4
DT[, .SD[2], by=x] # get 2nd row of each group
#> x v y a b
#> <char> <num> <num> <int> <int>
#> 1: b 1 3 2 8
#> 2: a 2 3 5 5
#> 3: c 2 3 8 2
DT[, tail(.SD,2), by=x] # last 2 rows of each group
#> x v y a b
#> <char> <num> <num> <int> <int>
#> 1: b 1 3 2 8
#> 2: b 1 6 3 7
#> 3: a 2 3 5 5
#> 4: a 1 6 6 4
#> 5: c 2 3 8 2
#> 6: c 2 6 9 1
DT[, lapply(.SD, sum), by=x] # sum of all (other) columns for each group
#> x v y a b
#> <char> <num> <num> <int> <int>
#> 1: b 3 10 6 24
#> 2: a 5 10 15 15
#> 3: c 5 10 24 6
DT[, .SD[which.min(v)], by=x] # nested query by group
#> x v y a b
#> <char> <num> <num> <int> <int>
#> 1: b 1 1 1 9
#> 2: a 1 6 6 4
#> 3: c 1 1 7 3
DT[, list(MySum=sum(v),
MyMin=min(v),
MyMax=max(v)),
by=.(x, y%%2)] # by 2 expressions
#> x y MySum MyMin MyMax
#> <char> <num> <num> <num> <num>
#> 1: b 1 2 1 1
#> 2: b 0 1 1 1
#> 3: a 1 4 2 2
#> 4: a 0 1 1 1
#> 5: c 1 3 1 2
#> 6: c 0 2 2 2
DT[, .(a = .(a), b = .(b)), by=x] # list columns
#> x a b
#> <char> <list> <list>
#> 1: b 1,2,3 9,8,7
#> 2: a 4,5,6 6,5,4
#> 3: c 7,8,9 3,2,1
DT[, .(seq = min(a):max(b)), by=x] # j is not limited to just aggregations
#> x seq
#> <char> <int>
#> 1: b 1
#> 2: b 2
#> 3: b 3
#> 4: b 4
#> 5: b 5
#> 6: b 6
#> 7: b 7
#> 8: b 8
#> 9: b 9
#> 10: a 4
#> 11: a 5
#> 12: a 6
#> 13: c 7
#> 14: c 6
#> 15: c 5
#> 16: c 4
#> 17: c 3
DT[, sum(v), by=x][V1<20] # compound query
#> x V1
#> <char> <num>
#> 1: b 3
#> 2: a 5
#> 3: c 5
DT[, sum(v), by=x][order(-V1)] # ordering results
#> x V1
#> <char> <num>
#> 1: a 5
#> 2: c 5
#> 3: b 3
DT[, c(.N, lapply(.SD,sum)), by=x] # get number of observations and sum per group
#> x N v y a b
#> <char> <int> <num> <num> <int> <int>
#> 1: b 3 3 10 6 24
#> 2: a 3 5 10 15 15
#> 3: c 3 5 10 24 6
DT[, {tmp <- mean(y);
.(a = a-tmp, b = b-tmp)
}, by=x] # anonymous lambda in 'j', j accepts any valid
#> x a b
#> <char> <num> <num>
#> 1: b -2.3333333 5.6666667
#> 2: b -1.3333333 4.6666667
#> 3: b -0.3333333 3.6666667
#> 4: a 0.6666667 2.6666667
#> 5: a 1.6666667 1.6666667
#> 6: a 2.6666667 0.6666667
#> 7: c 3.6666667 -0.3333333
#> 8: c 4.6666667 -1.3333333
#> 9: c 5.6666667 -2.3333333
# expression. TO REMEMBER: every element of
# the list becomes a column in result.
pdf("new.pdf")
DT[, plot(a,b), by=x] # can also plot in 'j'
#> Empty data.table (0 rows and 1 cols): x
dev.off()
#> pdf
#> 2
# using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v'
DT[, c(.(y=max(y)), lapply(.SD, min)), by=rleid(v), .SDcols=v:b]
#> rleid y v y a b
#> <int> <num> <num> <num> <int> <int>
#> 1: 1 6 1 1 1 7
#> 2: 2 3 2 1 4 5
#> 3: 3 6 1 1 6 3
#> 4: 4 6 2 3 8 1
# Support guide and links:
# https://github.com/Rdatatable/data.table/wiki/Support
if (FALSE) { # \dontrun{
if (interactive()) {
vignette(package="data.table") # 9 vignettes
test.data.table() # 6,000 tests
# keep up to date with latest stable version on CRAN
update.packages()
# get the latest devel version that has passed all tests
update_dev_pkg()
# read more at:
# https://github.com/Rdatatable/data.table/wiki/Installation
}
} # }