Enhanced data.frame — data.table-package (original) (raw)

if (FALSE) { # \dontrun{
example(data.table)  # to run these examples yourself
} # }
DF = data.frame(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
DT = data.table(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
DF
#>   x y v
#> 1 b 1 1
#> 2 b 3 2
#> 3 b 6 3
#> 4 a 1 4
#> 5 a 3 5
#> 6 a 6 6
#> 7 c 1 7
#> 8 c 3 8
#> 9 c 6 9
DT
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     1     1
#> 2:      b     3     2
#> 3:      b     6     3
#> 4:      a     1     4
#> 5:      a     3     5
#> 6:      a     6     6
#> 7:      c     1     7
#> 8:      c     3     8
#> 9:      c     6     9
identical(dim(DT), dim(DF))    # TRUE
#> [1] TRUE
identical(DF$a, DT$a)          # TRUE
#> [1] TRUE
is.list(DF)                    # TRUE
#> [1] TRUE
is.list(DT)                    # TRUE
#> [1] TRUE

is.data.frame(DT)              # TRUE
#> [1] TRUE

tables()
#>    NAME NROW NCOL MB  COLS    KEY
#> 1:   DT    9    3  0 x,y,v [NULL]
#> Total: 0MB using type_size

# basic row subset operations
DT[2]                          # 2nd row
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     3     2
DT[3:2]                        # 3rd and 2nd row
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     6     3
#> 2:      b     3     2
DT[order(x)]                   # no need for order(DT$x)
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
#> 4:      b     1     1
#> 5:      b     3     2
#> 6:      b     6     3
#> 7:      c     1     7
#> 8:      c     3     8
#> 9:      c     6     9
DT[order(x), ]                 # same as above. The ',' is optional
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
#> 4:      b     1     1
#> 5:      b     3     2
#> 6:      b     6     3
#> 7:      c     1     7
#> 8:      c     3     8
#> 9:      c     6     9
DT[y>2]                        # all rows where DT$y > 2
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     3     2
#> 2:      b     6     3
#> 3:      a     3     5
#> 4:      a     6     6
#> 5:      c     3     8
#> 6:      c     6     9
DT[y>2 & v>5]                  # compound logical expressions
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     6     6
#> 2:      c     3     8
#> 3:      c     6     9
DT[!2:4]                       # all rows other than 2:4
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     1     1
#> 2:      a     3     5
#> 3:      a     6     6
#> 4:      c     1     7
#> 5:      c     3     8
#> 6:      c     6     9
DT[-(2:4)]                     # same
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     1     1
#> 2:      a     3     5
#> 3:      a     6     6
#> 4:      c     1     7
#> 5:      c     3     8
#> 6:      c     6     9

# select|compute columns data.table way
DT[, v]                        # v column (as vector)
#> [1] 1 2 3 4 5 6 7 8 9
DT[, list(v)]                  # v column (as data.table)
#>        v
#>    <int>
#> 1:     1
#> 2:     2
#> 3:     3
#> 4:     4
#> 5:     5
#> 6:     6
#> 7:     7
#> 8:     8
#> 9:     9
DT[, .(v)]                     # same as above, .() is a shorthand alias to list()
#>        v
#>    <int>
#> 1:     1
#> 2:     2
#> 3:     3
#> 4:     4
#> 5:     5
#> 6:     6
#> 7:     7
#> 8:     8
#> 9:     9
DT[, sum(v)]                   # sum of column v, returned as vector
#> [1] 45
DT[, .(sum(v))]                # same, but return data.table (column autonamed V1)
#>       V1
#>    <int>
#> 1:    45
DT[, .(sv=sum(v))]             # same, but column named "sv"
#>       sv
#>    <int>
#> 1:    45
DT[, .(v, v*2)]                # return two column data.table, v and v*2
#>        v    V2
#>    <int> <num>
#> 1:     1     2
#> 2:     2     4
#> 3:     3     6
#> 4:     4     8
#> 5:     5    10
#> 6:     6    12
#> 7:     7    14
#> 8:     8    16
#> 9:     9    18

# subset rows and select|compute data.table way
DT[2:3, sum(v)]                # sum(v) over rows 2 and 3, return vector
#> [1] 5
DT[2:3, .(sum(v))]             # same, but return data.table with column V1
#>       V1
#>    <int>
#> 1:     5
DT[2:3, .(sv=sum(v))]          # same, but return data.table with column sv
#>       sv
#>    <int>
#> 1:     5
DT[2:5, cat(v, "\n")]          # just for j's side effect
#> 2 3 4 5 
#> NULL

# select columns the data.frame way
DT[, 2]                        # 2nd column, returns a data.table always
#>        y
#>    <num>
#> 1:     1
#> 2:     3
#> 3:     6
#> 4:     1
#> 5:     3
#> 6:     6
#> 7:     1
#> 8:     3
#> 9:     6
colNum = 2
DT[, ..colNum]                 # same, .. prefix conveys one-level-up in calling scope
#>        y
#>    <num>
#> 1:     1
#> 2:     3
#> 3:     6
#> 4:     1
#> 5:     3
#> 6:     6
#> 7:     1
#> 8:     3
#> 9:     6
DT[["v"]]                      # same as DT[, v] but faster if called in a loop
#> [1] 1 2 3 4 5 6 7 8 9

# grouping operations - j and by
DT[, sum(v), by=x]             # ad hoc by, order of groups preserved in result
#>         x    V1
#>    <char> <int>
#> 1:      b     6
#> 2:      a    15
#> 3:      c    24
DT[, sum(v), keyby=x]          # same, but order the result on by cols
#> Key: <x>
#>         x    V1
#>    <char> <int>
#> 1:      a    15
#> 2:      b     6
#> 3:      c    24
DT[, sum(v), by=x, keyby=TRUE] # same, but using sorting flag
#> Key: <x>
#>         x    V1
#>    <char> <int>
#> 1:      a    15
#> 2:      b     6
#> 3:      c    24
DT[, sum(v), by=x][order(x)]   # same but by chaining expressions together
#>         x    V1
#>    <char> <int>
#> 1:      a    15
#> 2:      b     6
#> 3:      c    24

# fast ad hoc row subsets (subsets as joins)
DT["a", on="x"]                # same as x == "a" but uses binary search (fast)
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
                               #   NB: requires DT to be keyed!
DT["a", on=.(x)]               # same, for convenience, no need to quote every column
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
                               #   NB: works regardless of whether or not DT is keyed!
DT[.("a"), on="x"]             # same
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
DT[x=="a"]                     # same, single "==" internally optimised to use binary search (fast)
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
DT[x!="b" | y!=3]              # not yet optimized, currently vector scan subset
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     1     1
#> 2:      b     6     3
#> 3:      a     1     4
#> 4:      a     3     5
#> 5:      a     6     6
#> 6:      c     1     7
#> 7:      c     3     8
#> 8:      c     6     9
DT[.("b", 3), on=c("x", "y")]  # join on columns x,y of DT; uses binary search (fast)
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     3     2
DT[.("b", 3), on=.(x, y)]      # same, but using on=.()
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     3     2
DT[.("b", 1:2), on=c("x", "y")]             # no match returns NA
#>         x     y     v
#>    <char> <int> <int>
#> 1:      b     1     1
#> 2:      b     2    NA
DT[.("b", 1:2), on=.(x, y), nomatch=NULL]   # no match row is not returned
#>         x     y     v
#>    <char> <int> <int>
#> 1:      b     1     1
DT[.("b", 1:2), on=c("x", "y"), roll=Inf]   # locf, nomatch row gets rolled by previous row
#>         x     y     v
#>    <char> <int> <int>
#> 1:      b     1     1
#> 2:      b     2     1
DT[.("b", 1:2), on=.(x, y), roll=-Inf]      # nocb, nomatch row gets rolled by next row
#>         x     y     v
#>    <char> <int> <int>
#> 1:      b     1     1
#> 2:      b     2     2
DT["b", sum(v*y), on="x"]                   # on rows where DT$x=="b", calculate sum(v*y)
#> [1] 25

# all together now
DT[x!="a", sum(v), by=x]                    # get sum(v) by "x" for each i != "a"
#>         x    V1
#>    <char> <int>
#> 1:      b     6
#> 2:      c    24
DT[!"a", sum(v), by=.EACHI, on="x"]         # same, but using subsets-as-joins
#>         x    V1
#>    <char> <int>
#> 1:      b     6
#> 2:      c    24
DT[c("b","c"), sum(v), by=.EACHI, on="x"]   # same
#>         x    V1
#>    <char> <int>
#> 1:      b     6
#> 2:      c    24
DT[c("b","c"), sum(v), by=.EACHI, on=.(x)]  # same, using on=.()
#>         x    V1
#>    <char> <int>
#> 1:      b     6
#> 2:      c    24

# joins as subsets
X = data.table(x=c("c","b"), v=8:7, foo=c(4,2))
X
#>         x     v   foo
#>    <char> <int> <num>
#> 1:      c     8     4
#> 2:      b     7     2

DT[X, on="x"]                         # right join
#>         x     y     v   i.v   foo
#>    <char> <num> <int> <int> <num>
#> 1:      c     1     7     8     4
#> 2:      c     3     8     8     4
#> 3:      c     6     9     8     4
#> 4:      b     1     1     7     2
#> 5:      b     3     2     7     2
#> 6:      b     6     3     7     2
X[DT, on="x"]                         # left join
#>         x     v   foo     y   i.v
#>    <char> <int> <num> <num> <int>
#> 1:      b     7     2     1     1
#> 2:      b     7     2     3     2
#> 3:      b     7     2     6     3
#> 4:      a    NA    NA     1     4
#> 5:      a    NA    NA     3     5
#> 6:      a    NA    NA     6     6
#> 7:      c     8     4     1     7
#> 8:      c     8     4     3     8
#> 9:      c     8     4     6     9
DT[X, on="x", nomatch=NULL]           # inner join
#>         x     y     v   i.v   foo
#>    <char> <num> <int> <int> <num>
#> 1:      c     1     7     8     4
#> 2:      c     3     8     8     4
#> 3:      c     6     9     8     4
#> 4:      b     1     1     7     2
#> 5:      b     3     2     7     2
#> 6:      b     6     3     7     2
DT[!X, on="x"]                        # not join
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
DT[X, on=c(y="v")]                    # join using column "y" of DT with column "v" of X
#>         x     y     v    i.x   foo
#>    <char> <int> <int> <char> <num>
#> 1:   <NA>     8    NA      c     4
#> 2:   <NA>     7    NA      b     2
DT[X, on="y==v"]                      # same as above (v1.9.8+)
#>         x     y     v    i.x   foo
#>    <char> <int> <int> <char> <num>
#> 1:   <NA>     8    NA      c     4
#> 2:   <NA>     7    NA      b     2

DT[X, on=.(y<=foo)]                   # NEW non-equi join (v1.9.8+)
#>         x     y     v    i.x   i.v
#>    <char> <num> <int> <char> <int>
#> 1:      b     4     1      c     8
#> 2:      b     4     2      c     8
#> 3:      a     4     4      c     8
#> 4:      a     4     5      c     8
#> 5:      c     4     7      c     8
#> 6:      c     4     8      c     8
#> 7:      b     2     1      b     7
#> 8:      a     2     4      b     7
#> 9:      c     2     7      b     7
DT[X, on="y<=foo"]                    # same as above
#>         x     y     v    i.x   i.v
#>    <char> <num> <int> <char> <int>
#> 1:      b     4     1      c     8
#> 2:      b     4     2      c     8
#> 3:      a     4     4      c     8
#> 4:      a     4     5      c     8
#> 5:      c     4     7      c     8
#> 6:      c     4     8      c     8
#> 7:      b     2     1      b     7
#> 8:      a     2     4      b     7
#> 9:      c     2     7      b     7
DT[X, on=c("y<=foo")]                 # same as above
#>         x     y     v    i.x   i.v
#>    <char> <num> <int> <char> <int>
#> 1:      b     4     1      c     8
#> 2:      b     4     2      c     8
#> 3:      a     4     4      c     8
#> 4:      a     4     5      c     8
#> 5:      c     4     7      c     8
#> 6:      c     4     8      c     8
#> 7:      b     2     1      b     7
#> 8:      a     2     4      b     7
#> 9:      c     2     7      b     7
DT[X, on=.(y>=foo)]                   # NEW non-equi join (v1.9.8+)
#>         x     y     v    i.x   i.v
#>    <char> <num> <int> <char> <int>
#> 1:      b     4     3      c     8
#> 2:      a     4     6      c     8
#> 3:      c     4     9      c     8
#> 4:      b     2     2      b     7
#> 5:      b     2     3      b     7
#> 6:      a     2     5      b     7
#> 7:      a     2     6      b     7
#> 8:      c     2     8      b     7
#> 9:      c     2     9      b     7
DT[X, on=.(x, y<=foo)]                # NEW non-equi join (v1.9.8+)
#>         x     y     v   i.v
#>    <char> <num> <int> <int>
#> 1:      c     4     7     8
#> 2:      c     4     8     8
#> 3:      b     2     1     7
DT[X, .(x,y,x.y,v), on=.(x, y>=foo)]  # Select x's join columns as well
#>         x     y   x.y     v
#>    <char> <num> <num> <int>
#> 1:      c     4     6     9
#> 2:      b     2     3     2
#> 3:      b     2     6     3

DT[X, on="x", mult="first"]           # first row of each group
#>         x     y     v   i.v   foo
#>    <char> <num> <int> <int> <num>
#> 1:      c     1     7     8     4
#> 2:      b     1     1     7     2
DT[X, on="x", mult="last"]            # last row of each group
#>         x     y     v   i.v   foo
#>    <char> <num> <int> <int> <num>
#> 1:      c     6     9     8     4
#> 2:      b     6     3     7     2
DT[X, sum(v), by=.EACHI, on="x"]      # join and eval j for each row in i
#>         x    V1
#>    <char> <int>
#> 1:      c    24
#> 2:      b     6
DT[X, sum(v)*foo, by=.EACHI, on="x"]  # join inherited scope
#>         x    V1
#>    <char> <num>
#> 1:      c    96
#> 2:      b    12
DT[X, sum(v)*i.v, by=.EACHI, on="x"]  # 'i,v' refers to X's v column
#>         x    V1
#>    <char> <int>
#> 1:      c   192
#> 2:      b    42
DT[X, on=.(x, v>=v), sum(y)*foo, by=.EACHI] # NEW non-equi join with by=.EACHI (v1.9.8+)
#>         x     v    V1
#>    <char> <int> <num>
#> 1:      c     8    36
#> 2:      b     7    NA

# setting keys
kDT = copy(DT)                        # (deep) copy DT to kDT to work with it.
setkey(kDT,x)                         # set a 1-column key. No quotes, for convenience.
setkeyv(kDT,"x")                      # same (v in setkeyv stands for vector)
v="x"
setkeyv(kDT,v)                        # same
haskey(kDT)                           # TRUE
#> [1] TRUE
key(kDT)                              # "x"
#> [1] "x"

# fast *keyed* subsets
kDT["a"]                              # subset-as-join on *key* column 'x'
#> Key: <x>
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
kDT["a", on="x"]                      # same, being explicit using 'on=' (preferred)
#> Key: <x>
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6

# all together
kDT[!"a", sum(v), by=.EACHI]          # get sum(v) for each i != "a"
#> Key: <x>
#>         x    V1
#>    <char> <int>
#> 1:      b     6
#> 2:      c    24

# multi-column key
setkey(kDT,x,y)                       # 2-column key
setkeyv(kDT,c("x","y"))               # same

# fast *keyed* subsets on multi-column key
kDT["a"]                              # join to 1st column of key
#> Key: <x, y>
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
kDT["a", on="x"]                      # on= is optional, but is preferred
#> Key: <x, y>
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
kDT[.("a")]                           # same, .() is an alias for list()
#> Key: <x, y>
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
kDT[list("a")]                        # same
#> Key: <x, y>
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     1     4
#> 2:      a     3     5
#> 3:      a     6     6
kDT[.("a", 3)]                        # join to 2 columns
#> Key: <x, y>
#>         x     y     v
#>    <char> <num> <int>
#> 1:      a     3     5
kDT[.("a", 3:6)]                      # join 4 rows (2 missing)
#>         x     y     v
#>    <char> <int> <int>
#> 1:      a     3     5
#> 2:      a     4    NA
#> 3:      a     5    NA
#> 4:      a     6     6
kDT[.("a", 3:6), nomatch=NULL]        # remove missing
#> Key: <x, y>
#>         x     y     v
#>    <char> <int> <int>
#> 1:      a     3     5
#> 2:      a     6     6
kDT[.("a", 3:6), roll=TRUE]           # locf rolling join
#>         x     y     v
#>    <char> <int> <int>
#> 1:      a     3     5
#> 2:      a     4     5
#> 3:      a     5     5
#> 4:      a     6     6
kDT[.("a", 3:6), roll=Inf]            # same as above
#>         x     y     v
#>    <char> <int> <int>
#> 1:      a     3     5
#> 2:      a     4     5
#> 3:      a     5     5
#> 4:      a     6     6
kDT[.("a", 3:6), roll=-Inf]           # nocb rolling join
#>         x     y     v
#>    <char> <int> <int>
#> 1:      a     3     5
#> 2:      a     4     6
#> 3:      a     5     6
#> 4:      a     6     6
kDT[!.("a")]                          # not join
#> Key: <x, y>
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     1     1
#> 2:      b     3     2
#> 3:      b     6     3
#> 4:      c     1     7
#> 5:      c     3     8
#> 6:      c     6     9
kDT[!"a"]                             # same
#> Key: <x, y>
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     1     1
#> 2:      b     3     2
#> 3:      b     6     3
#> 4:      c     1     7
#> 5:      c     3     8
#> 6:      c     6     9

# more on special symbols, see also ?"special-symbols"
DT[.N]                                  # last row
#>         x     y     v
#>    <char> <num> <int>
#> 1:      c     6     9
DT[, .N]                                # total number of rows in DT
#> [1] 9
DT[, .N, by=x]                          # number of rows in each group
#>         x     N
#>    <char> <int>
#> 1:      b     3
#> 2:      a     3
#> 3:      c     3
DT[, .SD, .SDcols=x:y]                  # select columns 'x' through 'y'
#> Index: <x>
#>         x     y
#>    <char> <num>
#> 1:      b     1
#> 2:      b     3
#> 3:      b     6
#> 4:      a     1
#> 5:      a     3
#> 6:      a     6
#> 7:      c     1
#> 8:      c     3
#> 9:      c     6
DT[ , .SD, .SDcols = !x:y]              # drop columns 'x' through 'y'
#>        v
#>    <int>
#> 1:     1
#> 2:     2
#> 3:     3
#> 4:     4
#> 5:     5
#> 6:     6
#> 7:     7
#> 8:     8
#> 9:     9
DT[ , .SD, .SDcols = patterns('^[xv]')] # select columns matching '^x' or '^v'
#> Index: <x>
#>         x     v
#>    <char> <int>
#> 1:      b     1
#> 2:      b     2
#> 3:      b     3
#> 4:      a     4
#> 5:      a     5
#> 6:      a     6
#> 7:      c     7
#> 8:      c     8
#> 9:      c     9
DT[, .SD[1]]                            # first row of all columns
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     1     1
DT[, .SD[1], by=x]                      # first row of 'y' and 'v' for each group in 'x'
#>         x     y     v
#>    <char> <num> <int>
#> 1:      b     1     1
#> 2:      a     1     4
#> 3:      c     1     7
DT[, c(.N, lapply(.SD, sum)), by=x]     # get rows *and* sum columns 'v' and 'y' by group
#>         x     N     y     v
#>    <char> <int> <num> <int>
#> 1:      b     3    10     6
#> 2:      a     3    10    15
#> 3:      c     3    10    24
DT[, .I[1], by=x]                       # row number in DT corresponding to each group
#>         x    V1
#>    <char> <int>
#> 1:      b     1
#> 2:      a     4
#> 3:      c     7
DT[, grp := .GRP, by=x]                 # add a group counter column
#> Index: <x>
#>         x     y     v   grp
#>    <char> <num> <int> <int>
#> 1:      b     1     1     1
#> 2:      b     3     2     1
#> 3:      b     6     3     1
#> 4:      a     1     4     2
#> 5:      a     3     5     2
#> 6:      a     6     6     2
#> 7:      c     1     7     3
#> 8:      c     3     8     3
#> 9:      c     6     9     3
DT[ , dput(.BY), by=.(x,y)]             # .BY is a list of singletons for each group
#> list(x = "b", y = 1)
#> list(x = "b", y = 3)
#> list(x = "b", y = 6)
#> list(x = "a", y = 1)
#> list(x = "a", y = 3)
#> list(x = "a", y = 6)
#> list(x = "c", y = 1)
#> list(x = "c", y = 3)
#> list(x = "c", y = 6)
#>         x     y      x     y
#>    <char> <num> <char> <num>
#> 1:      b     1      b     1
#> 2:      b     3      b     3
#> 3:      b     6      b     6
#> 4:      a     1      a     1
#> 5:      a     3      a     3
#> 6:      a     6      a     6
#> 7:      c     1      c     1
#> 8:      c     3      c     3
#> 9:      c     6      c     6
X[, DT[.BY, y, on="x"], by=x]           # join within each group
#>         x    V1
#>    <char> <num>
#> 1:      c     1
#> 2:      c     3
#> 3:      c     6
#> 4:      b     1
#> 5:      b     3
#> 6:      b     6
DT[, {
  # write each group to a different file
  fwrite(.SD, file.path(tempdir(), paste0('x=', .BY$x, '.csv')))
}, by=x]
#> Empty data.table (0 rows and 1 cols): x
dir(tempdir())
#> [1] "bslib-c167c22974dd79c09b3195ad9df50566"
#> [2] "downlit"                               
#> [3] "file5fe710e4f9"                        
#> [4] "x=a.csv"                               
#> [5] "x=b.csv"                               
#> [6] "x=c.csv"                               

# add/update/delete by reference (see ?assign)
print(DT[, z:=42L])                   # add new column by reference
#> Index: <x>
#>         x     y     v   grp     z
#>    <char> <num> <int> <int> <int>
#> 1:      b     1     1     1    42
#> 2:      b     3     2     1    42
#> 3:      b     6     3     1    42
#> 4:      a     1     4     2    42
#> 5:      a     3     5     2    42
#> 6:      a     6     6     2    42
#> 7:      c     1     7     3    42
#> 8:      c     3     8     3    42
#> 9:      c     6     9     3    42
print(DT[, z:=NULL])                  # remove column by reference
#> Index: <x>
#>         x     y     v   grp
#>    <char> <num> <int> <int>
#> 1:      b     1     1     1
#> 2:      b     3     2     1
#> 3:      b     6     3     1
#> 4:      a     1     4     2
#> 5:      a     3     5     2
#> 6:      a     6     6     2
#> 7:      c     1     7     3
#> 8:      c     3     8     3
#> 9:      c     6     9     3
print(DT["a", v:=42L, on="x"])        # subassign to existing v column by reference
#> Index: <x>
#>         x     y     v   grp
#>    <char> <num> <int> <int>
#> 1:      b     1     1     1
#> 2:      b     3     2     1
#> 3:      b     6     3     1
#> 4:      a     1    42     2
#> 5:      a     3    42     2
#> 6:      a     6    42     2
#> 7:      c     1     7     3
#> 8:      c     3     8     3
#> 9:      c     6     9     3
print(DT["b", v2:=84L, on="x"])       # subassign to new column by reference (NA padded)
#> Index: <x>
#>         x     y     v   grp    v2
#>    <char> <num> <int> <int> <int>
#> 1:      b     1     1     1    84
#> 2:      b     3     2     1    84
#> 3:      b     6     3     1    84
#> 4:      a     1    42     2    NA
#> 5:      a     3    42     2    NA
#> 6:      a     6    42     2    NA
#> 7:      c     1     7     3    NA
#> 8:      c     3     8     3    NA
#> 9:      c     6     9     3    NA

DT[, m:=mean(v), by=x][]              # add new column by reference by group
#> Index: <x>
#>         x     y     v   grp    v2     m
#>    <char> <num> <int> <int> <int> <num>
#> 1:      b     1     1     1    84     2
#> 2:      b     3     2     1    84     2
#> 3:      b     6     3     1    84     2
#> 4:      a     1    42     2    NA    42
#> 5:      a     3    42     2    NA    42
#> 6:      a     6    42     2    NA    42
#> 7:      c     1     7     3    NA     8
#> 8:      c     3     8     3    NA     8
#> 9:      c     6     9     3    NA     8
                                      # NB: postfix [] is shortcut to print()
# advanced usage
DT = data.table(x=rep(c("b","a","c"),each=3), v=c(1,1,1,2,2,1,1,2,2), y=c(1,3,6), a=1:9, b=9:1)

DT[, sum(v), by=.(y%%2)]              # expressions in by
#>        y    V1
#>    <num> <num>
#> 1:     1     9
#> 2:     0     4
DT[, sum(v), by=.(bool = y%%2)]       # same, using a named list to change by column name
#>     bool    V1
#>    <num> <num>
#> 1:     1     9
#> 2:     0     4
DT[, .SD[2], by=x]                    # get 2nd row of each group
#>         x     v     y     a     b
#>    <char> <num> <num> <int> <int>
#> 1:      b     1     3     2     8
#> 2:      a     2     3     5     5
#> 3:      c     2     3     8     2
DT[, tail(.SD,2), by=x]               # last 2 rows of each group
#>         x     v     y     a     b
#>    <char> <num> <num> <int> <int>
#> 1:      b     1     3     2     8
#> 2:      b     1     6     3     7
#> 3:      a     2     3     5     5
#> 4:      a     1     6     6     4
#> 5:      c     2     3     8     2
#> 6:      c     2     6     9     1
DT[, lapply(.SD, sum), by=x]          # sum of all (other) columns for each group
#>         x     v     y     a     b
#>    <char> <num> <num> <int> <int>
#> 1:      b     3    10     6    24
#> 2:      a     5    10    15    15
#> 3:      c     5    10    24     6
DT[, .SD[which.min(v)], by=x]         # nested query by group
#>         x     v     y     a     b
#>    <char> <num> <num> <int> <int>
#> 1:      b     1     1     1     9
#> 2:      a     1     6     6     4
#> 3:      c     1     1     7     3

DT[, list(MySum=sum(v),
          MyMin=min(v),
          MyMax=max(v)),
    by=.(x, y%%2)]                    # by 2 expressions
#>         x     y MySum MyMin MyMax
#>    <char> <num> <num> <num> <num>
#> 1:      b     1     2     1     1
#> 2:      b     0     1     1     1
#> 3:      a     1     4     2     2
#> 4:      a     0     1     1     1
#> 5:      c     1     3     1     2
#> 6:      c     0     2     2     2

DT[, .(a = .(a), b = .(b)), by=x]     # list columns
#>         x      a      b
#>    <char> <list> <list>
#> 1:      b  1,2,3  9,8,7
#> 2:      a  4,5,6  6,5,4
#> 3:      c  7,8,9  3,2,1
DT[, .(seq = min(a):max(b)), by=x]    # j is not limited to just aggregations
#>          x   seq
#>     <char> <int>
#>  1:      b     1
#>  2:      b     2
#>  3:      b     3
#>  4:      b     4
#>  5:      b     5
#>  6:      b     6
#>  7:      b     7
#>  8:      b     8
#>  9:      b     9
#> 10:      a     4
#> 11:      a     5
#> 12:      a     6
#> 13:      c     7
#> 14:      c     6
#> 15:      c     5
#> 16:      c     4
#> 17:      c     3
DT[, sum(v), by=x][V1<20]             # compound query
#>         x    V1
#>    <char> <num>
#> 1:      b     3
#> 2:      a     5
#> 3:      c     5
DT[, sum(v), by=x][order(-V1)]        # ordering results
#>         x    V1
#>    <char> <num>
#> 1:      a     5
#> 2:      c     5
#> 3:      b     3
DT[, c(.N, lapply(.SD,sum)), by=x]    # get number of observations and sum per group
#>         x     N     v     y     a     b
#>    <char> <int> <num> <num> <int> <int>
#> 1:      b     3     3    10     6    24
#> 2:      a     3     5    10    15    15
#> 3:      c     3     5    10    24     6
DT[, {tmp <- mean(y);
      .(a = a-tmp, b = b-tmp)
      }, by=x]                        # anonymous lambda in 'j', j accepts any valid
#>         x          a          b
#>    <char>      <num>      <num>
#> 1:      b -2.3333333  5.6666667
#> 2:      b -1.3333333  4.6666667
#> 3:      b -0.3333333  3.6666667
#> 4:      a  0.6666667  2.6666667
#> 5:      a  1.6666667  1.6666667
#> 6:      a  2.6666667  0.6666667
#> 7:      c  3.6666667 -0.3333333
#> 8:      c  4.6666667 -1.3333333
#> 9:      c  5.6666667 -2.3333333
                                      # expression. TO REMEMBER: every element of
                                      # the list becomes a column in result.
pdf("new.pdf")
DT[, plot(a,b), by=x]                 # can also plot in 'j'
#> Empty data.table (0 rows and 1 cols): x
dev.off()
#> pdf 
#>   2 

# using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v'
DT[, c(.(y=max(y)), lapply(.SD, min)), by=rleid(v), .SDcols=v:b]
#>    rleid     y     v     y     a     b
#>    <int> <num> <num> <num> <int> <int>
#> 1:     1     6     1     1     1     7
#> 2:     2     3     2     1     4     5
#> 3:     3     6     1     1     6     3
#> 4:     4     6     2     3     8     1

# Support guide and links:
# https://github.com/Rdatatable/data.table/wiki/Support

if (FALSE) { # \dontrun{
if (interactive()) {
  vignette(package="data.table")  # 9 vignettes

  test.data.table()               # 6,000 tests

  # keep up to date with latest stable version on CRAN
  update.packages()

  # get the latest devel version that has passed all tests
  update_dev_pkg()
  # read more at:
  # https://github.com/Rdatatable/data.table/wiki/Installation
}
} # }