ECON 413
Data manipulation with data.table

Erol Taymaz
Department of Economics
Middle East Technical University

Data science process

Cleaning and transforming the data

Why?

Why data.table?

Create a data.table

fread(“somedata.csv”)

data.table(x = 1:10)

as.data.table(df)

setDT(df)

How it works?

Data table operations operate within the object

[ is a function!

dt[i, j, by]

i filter or arrange, operates on rows (observations)

j change/create, operates on columns (variables)

by group-by, groups observations

A simple example

library(data.table)

set.seed(1234)
nstudent <- 100
DT <- data.table(id = 1:nstudent,
                 class = c(1:4),
                 gender = sample(c("M", "F"), size = 100, replace = TRUE),
                 grade = 100*runif(nstudent))

DT[class == 1,
   mean(grade, na.rm = T),
   by = gender]
##    gender       V1
## 1:      F 50.76277
## 2:      M 52.89040
set.seed(1234)
DF <- data.frame(id = 1:nstudent,
                 class = c(1:4),
                 gender = sample(c("M", "F"), size = 100, replace = TRUE),
                 grade = 100*runif(nstudent))

aggregate(DF[DF$class == 1, ]$grade, by = list(DF[DF$class == 1, ]$gender), mean)
##   Group.1        x
## 1       F 50.76277
## 2       M 52.89040

Operating on rows (variables)

# Row number
DT[1:5]
##    id class gender     grade
## 1:  1     1      F  3.545673
## 2:  2     2      F 56.507611
## 3:  3     3      F 28.025778
## 4:  4     4      F 20.419632
## 5:  5     1      M 13.373890
DT[1:5,]
##    id class gender     grade
## 1:  1     1      F  3.545673
## 2:  2     2      F 56.507611
## 3:  3     3      F 28.025778
## 4:  4     4      F 20.419632
## 5:  5     1      M 13.373890
DT[c(1,3,5)]
##    id class gender     grade
## 1:  1     1      F  3.545673
## 2:  3     3      F 28.025778
## 3:  5     1      M 13.373890
DF[1:5, ]
##   id class gender     grade
## 1  1     1      F  3.545673
## 2  2     2      F 56.507611
## 3  3     3      F 28.025778
## 4  4     4      F 20.419632
## 5  5     1      M 13.373890
# Logical operator
DTM <- DT[gender == "M"]
DTI <- DT[grade >= 90]

# Multiple filters/conditions
DT[gender == "M" & grade > 80]
##     id class gender    grade
##  1: 31     3      M 96.22228
##  2: 35     3      M 91.43169
##  3: 37     1      M 90.81691
##  4: 40     4      M 86.91583
##  5: 42     2      M 98.36351
##  6: 49     1      M 98.07865
##  7: 58     2      M 99.87408
##  8: 69     1      M 80.24020
##  9: 85     1      M 95.59926
## 10: 87     3      M 84.17106
## 11: 92     4      M 99.25160
## 12: 95     3      M 81.00834
# Order by row
DT <- DT[order(grade)]
DT[1:5]
##    id class gender    grade
## 1: 86     2      F 2.220682
## 2:  1     1      F 3.545673
## 3: 10     2      F 3.864265
## 4: 68     4      M 7.271609
## 5: 84     4      F 7.608332
DF <- DF[order(DF$grade), ]
DF[1:5, ]
##    id class gender    grade
## 86 86     2      F 2.220682
## 1   1     1      F 3.545673
## 10 10     2      F 3.864265
## 68 68     4      M 7.271609
## 84 84     4      F 7.608332
DT <- DT[order(-grade)]
DT[1:5]
##    id class gender    grade
## 1: 58     2      M 99.87408
## 2: 23     3      F 99.45982
## 3: 92     4      M 99.25160
## 4: 56     4      F 98.81674
## 5: 42     2      M 98.36351
DT <- DT[order(class, -grade)]
DT[1:5]
##    id class gender    grade
## 1: 49     1      M 98.07865
## 2: 85     1      M 95.59926
## 3: 13     1      F 95.03049
## 4: 17     1      F 94.58531
## 5: 37     1      M 90.81691
setorder(DT, class, -grade, na.last = T)
DT[1:5]
##    id class gender    grade
## 1: 49     1      M 98.07865
## 2: 85     1      M 95.59926
## 3: 13     1      F 95.03049
## 4: 17     1      F 94.58531
## 5: 37     1      M 90.81691

Operating on columns (variables)

Two types of operations

Operating on columns (same dataset)

# Create a new variable
DT[, loggrade := log(grade)]
DT[, ngender := ifelse(gender == "M", 0, 1)]

DF$loggrade <- log(DF$grade)
DF$ngender <- ifelse(DF$gender == "M", 0, 1)

# Change an existing variable
DT[, grade := grade / 100]

# These changes are made by reference, 
# i.e., in place. So we do not have to assign 
# the object to save these changes.

# Modification by reference
names(DT)
## [1] "id"       "class"    "gender"   "grade"    "loggrade" "ngender"
DT_copy <- DT
names(DT_copy)
## [1] "id"       "class"    "gender"   "grade"    "loggrade" "ngender"
# Deleting a variable
DT_copy[, ngender := NULL]

names(DT_copy)
## [1] "id"       "class"    "gender"   "grade"    "loggrade"
names(DT)
## [1] "id"       "class"    "gender"   "grade"    "loggrade"
# If you want to duplicate the data for some reason, 
# use the __copy_ function

DT_copy <- copy(DT)
DT_copy[, gender := NULL]

names(DT_copy)
## [1] "id"       "class"    "grade"    "loggrade"
names(DT)
## [1] "id"       "class"    "gender"   "grade"    "loggrade"

Operating on a subset of columns

# Add a new dummy variable

DT[grade >= 60, pass := 1]
summary(DT$pass)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      NA      NA      NA     NaN      NA      NA     100
DF$pass[DF$grade >= 60] <- 1
summary(DF$pass)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       1       1       1       1       1       1      58
DT[, pass := NULL]
DT[, pass := ifelse(grade >= 60, 1, 0)]
summary(DT$pass)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0

Creating two variables

DT[, c("honor", "hhonor") :=
     list(ifelse(grade >=80 & grade < 90, 1, 0),
          ifelse(grade >=90, 1, 0))]

summary(DT[, list(honor, hhonor)])
##      honor       hhonor 
##  Min.   :0   Min.   :0  
##  1st Qu.:0   1st Qu.:0  
##  Median :0   Median :0  
##  Mean   :0   Mean   :0  
##  3rd Qu.:0   3rd Qu.:0  
##  Max.   :0   Max.   :0
summary(DT[, .(honor, hhonor)])
##      honor       hhonor 
##  Min.   :0   Min.   :0  
##  1st Qu.:0   1st Qu.:0  
##  Median :0   Median :0  
##  Mean   :0   Mean   :0  
##  3rd Qu.:0   3rd Qu.:0  
##  Max.   :0   Max.   :0
# Functional form
DT[, `:=`(honor = ifelse(grade >=80 & grade < 90, 1, 0),
          hhonor =
            ifelse(grade >=90, 1, 0))]

# Or, two more lines
DT[, honor := ifelse(grade >=80 & grade < 90, 1, 0)]
DT[, hhonor := ifelse(grade >=90, 1, 0)]

Chaining data.table operations

DT[, error := rnorm(100)][, sqerr := error^2]

Subsetting on columns

DT[1:5, c(1:2)]
##    id class
## 1: 49     1
## 2: 85     1
## 3: 13     1
## 4: 17     1
## 5: 37     1
DT[1:5, list(id, class)]
##    id class
## 1: 49     1
## 2: 85     1
## 3: 13     1
## 4: 17     1
## 5: 37     1
DT[1:5, .(id, class)]
##    id class
## 1: 49     1
## 2: 85     1
## 3: 13     1
## 4: 17     1
## 5: 37     1
DT[1:5, c("id", "class")]
##    id class
## 1: 49     1
## 2: 85     1
## 3: 13     1
## 4: 17     1
## 5: 37     1
DT[1:5, !c("id", "class")]
##    gender     grade loggrade pass honor hhonor       error       sqerr
## 1:      M 0.9807865 4.585770    0     0      0  0.41452353 0.171829761
## 2:      M 0.9559926 4.560165    0     0      0 -0.47471847 0.225357629
## 3:      F 0.9503049 4.554198    0     0      0  0.06599349 0.004355141
## 4:      F 0.9458531 4.549502    0     0      0 -0.50247778 0.252483922
## 5:      M 0.9081691 4.508846    0     0      0 -0.82599859 0.682273666
DT[1:5, id]
## [1] 49 85 13 17 37
DT[1:5]$id
## [1] 49 85 13 17 37

Renaming variables

setnames(DT, "class", "level")

Aggregating

Operating on columns (new dataset, reduced number of rows)

DT[, mean(grade, na.rm =T)]
## [1] 0.5290748
mean(DF$grade, na.rm = T)
## [1] 52.90748
dtm <- DT[, mean(grade, na.rm =T)]
dtm
## [1] 0.5290748
DT[, list(xgrade = max(grade, na.rm = T))]
##       xgrade
## 1: 0.9987408
DT[, list(agrade = mean(grade),
          xgrade = max(grade))]
##       agrade    xgrade
## 1: 0.5290748 0.9987408
DT[, xgrade := max(grade)]
DT[1:5]
##    id level gender     grade loggrade pass honor hhonor       error       sqerr
## 1: 49     1      M 0.9807865 4.585770    0     0      0  0.41452353 0.171829761
## 2: 85     1      M 0.9559926 4.560165    0     0      0 -0.47471847 0.225357629
## 3: 13     1      F 0.9503049 4.554198    0     0      0  0.06599349 0.004355141
## 4: 17     1      F 0.9458531 4.549502    0     0      0 -0.50247778 0.252483922
## 5: 37     1      M 0.9081691 4.508846    0     0      0 -0.82599859 0.682273666
##       xgrade
## 1: 0.9987408
## 2: 0.9987408
## 3: 0.9987408
## 4: 0.9987408
## 5: 0.9987408

Group by

DT[, mean(grade), by = level]
##    level        V1
## 1:     1 0.5178403
## 2:     2 0.5263149
## 3:     3 0.5326580
## 4:     4 0.5394860
DT[, mean(grade), by = .(level, gender)]
##    level gender        V1
## 1:     1      M 0.5289040
## 2:     1      F 0.5076277
## 3:     2      M 0.5940307
## 4:     2      F 0.4731096
## 5:     3      F 0.4809367
## 6:     3      M 0.5984851
## 7:     4      M 0.5090924
## 8:     4      F 0.5537889
aggregate(grade ~ class + gender, data = DF, mean)
##   class gender    grade
## 1     1      F 50.76277
## 2     2      F 47.31096
## 3     3      F 48.09367
## 4     4      F 55.37889
## 5     1      M 52.89040
## 6     2      M 59.40307
## 7     3      M 59.84851
## 8     4      M 50.90924
DT[, .(agrade = mean(grade), 
       xgrade = max(grade)), by = level]
##    level    agrade    xgrade
## 1:     1 0.5178403 0.9807865
## 2:     2 0.5263149 0.9987408
## 3:     3 0.5326580 0.9945982
## 4:     4 0.5394860 0.9925160
DT[, .(atgrade = mean(grade), 
       xtgrade = max(grade)), by = .(level, grade >= 70)]
##    level grade >= 70   atgrade   xtgrade
## 1:     1       FALSE 0.5178403 0.9807865
## 2:     2       FALSE 0.5263149 0.9987408
## 3:     3       FALSE 0.5326580 0.9945982
## 4:     4       FALSE 0.5394860 0.9925160
# What happens in the following cases
DT[, .(agrade = mean(grade)), by = level]
##    level    agrade
## 1:     1 0.5178403
## 2:     2 0.5263149
## 3:     3 0.5326580
## 4:     4 0.5394860
DT[, agrade := mean(grade), by = level]

DT[, dgrade := grade - mean(grade), by = level]

DT[1:5]
##    id level gender     grade loggrade pass honor hhonor       error       sqerr
## 1: 49     1      M 0.9807865 4.585770    0     0      0  0.41452353 0.171829761
## 2: 85     1      M 0.9559926 4.560165    0     0      0 -0.47471847 0.225357629
## 3: 13     1      F 0.9503049 4.554198    0     0      0  0.06599349 0.004355141
## 4: 17     1      F 0.9458531 4.549502    0     0      0 -0.50247778 0.252483922
## 5: 37     1      M 0.9081691 4.508846    0     0      0 -0.82599859 0.682273666
##       xgrade    agrade    dgrade
## 1: 0.9987408 0.5178403 0.4629462
## 2: 0.9987408 0.5178403 0.4381523
## 3: 0.9987408 0.5178403 0.4324646
## 4: 0.9987408 0.5178403 0.4280128
## 5: 0.9987408 0.5178403 0.3903289

Subsetting columns with .SD

# Get the mean value of 3 variables
DT[, .(mean(grade), mean(error), mean(sqerr)), by = level]
##    level        V1          V2        V3
## 1:     1 0.5178403  0.05593210 0.6493600
## 2:     2 0.5263149 -0.01597593 0.8066647
## 3:     3 0.5326580  0.10037441 0.9004802
## 4:     4 0.5394860  0.02464213 1.8693256
DT[, lapply(.SD, mean), by = level, 
   .SDcols = c("grade", "error", "sqerr")]
##    level     grade       error     sqerr
## 1:     1 0.5178403  0.05593210 0.6493600
## 2:     2 0.5263149 -0.01597593 0.8066647
## 3:     3 0.5326580  0.10037441 0.9004802
## 4:     4 0.5394860  0.02464213 1.8693256
svar <- c("grade", "error", "sqerr")
DT[, lapply(.SD, mean), by = level, .SDcols = svar]
##    level     grade       error     sqerr
## 1:     1 0.5178403  0.05593210 0.6493600
## 2:     2 0.5263149 -0.01597593 0.8066647
## 3:     3 0.5326580  0.10037441 0.9004802
## 4:     4 0.5394860  0.02464213 1.8693256
DT[, lapply(.SD, class), by = level, .SDcols = id:gender]
##    level      id   level    gender
## 1:     1 integer integer character
## 2:     2 integer integer character
## 3:     3 integer integer character
## 4:     4 integer integer character
# .SD is useful especially if names are not known
nvar <- names(DT)[is.numeric(DT)]
DT[, lapply(.SD, mean), by = level, .SDcols = nvar]
## Empty data.table (0 rows and 1 cols): level
# keyby
DT[, lapply(.SD, mean), keyby = level, .SDcols = nvar]
## Empty data.table (0 rows and 1 cols): level

keys

key is used for very fast subsetting The dataset is ordered by key

setkey(DT, level)
setkey(DT,  level, gender)
# Orders the data

key(DT)
## [1] "level"  "gender"
DT[, mean(grade), by = .(level, gender)]
##    level gender        V1
## 1:     1      F 0.5076277
## 2:     1      M 0.5289040
## 3:     2      F 0.4731096
## 4:     2      M 0.5940307
## 5:     3      F 0.4809367
## 6:     3      M 0.5984851
## 7:     4      F 0.5537889
## 8:     4      M 0.5090924
library(microbenchmark)
microbenchmark(DT[level == 2 & gender == "M", mean(grade)], times = 1000, unit = "ms")
## Unit: milliseconds
##                                         expr     min       lq     mean   median
##  DT[level == 2 & gender == "M", mean(grade)] 1.16845 1.344455 1.541725 1.415278
##        uq      max neval
##  1.612858 5.916052  1000
microbenchmark(DT[.(2, "M"), mean(grade)], times = 1000, unit = "ms")
## Unit: milliseconds
##                        expr      min       lq      mean   median       uq
##  DT[.(2, "M"), mean(grade)] 0.577323 0.646441 0.7303203 0.681396 0.732868
##       max neval
##  3.820128  1000
microbenchmark(DT[level %in% c(2:3) & gender == "M", mean(grade)], times = 1000, unit = "ms")
## Unit: milliseconds
##                                                expr      min       lq     mean
##  DT[level %in% c(2:3) & gender == "M", mean(grade)] 1.047106 1.204598 1.343809
##    median       uq      max neval
##  1.268401 1.361413 9.407477  1000
microbenchmark(DT[.(2:3, "M"), mean(grade)], times = 1000, unit = "ms")
## Unit: milliseconds
##                          expr      min        lq      mean    median       uq
##  DT[.(2:3, "M"), mean(grade)] 0.524991 0.6233255 0.6939423 0.6461195 0.704652
##       max neval
##  5.149863  1000

Merging datasets

set.seed(1234)
nstudent <- 20
DT1 <- data.table(id = 1:nstudent,
                  class = c(1:4),
                  gender = c("M", "F"),
                  grade = 100*runif(nstudent))

DT11 <- data.table(id = 1:nstudent,
                   class = c(1:4),
                   hs = rnorm(nstudent))

DT2 <- data.table(class = c(1:3), 
                  inst = c("A", "B", "C"), 
                  gender = c("M", "F", "F"))

# Left join
DTM <- DT1[DT2, on = "class"]
DTM
##     id class gender      grade inst i.gender
##  1:  1     1      M 11.3703411    A        M
##  2:  5     1      M 86.0915384    A        M
##  3:  9     1      M 66.6083758    A        M
##  4: 13     1      M 28.2733584    A        M
##  5: 17     1      M 28.6223285    A        M
##  6:  2     2      F 62.2299405    B        F
##  7:  6     2      F 64.0310605    B        F
##  8: 10     2      F 51.4251141    B        F
##  9: 14     2      F 92.3433484    B        F
## 10: 18     2      F 26.6820780    B        F
## 11:  3     3      M 60.9274733    C        F
## 12:  7     3      M  0.9495756    C        F
## 13: 11     3      M 69.3591292    C        F
## 14: 15     3      M 29.2315840    C        F
## 15: 19     3      M 18.6722790    C        F
# Right join
DTM <- DT2[DT1, on = "class"]
DTM
##     class inst gender id i.gender      grade
##  1:     1    A      M  1        M 11.3703411
##  2:     2    B      F  2        F 62.2299405
##  3:     3    C      F  3        M 60.9274733
##  4:     4 <NA>   <NA>  4        F 62.3379442
##  5:     1    A      M  5        M 86.0915384
##  6:     2    B      F  6        F 64.0310605
##  7:     3    C      F  7        M  0.9495756
##  8:     4 <NA>   <NA>  8        F 23.2550506
##  9:     1    A      M  9        M 66.6083758
## 10:     2    B      F 10        F 51.4251141
## 11:     3    C      F 11        M 69.3591292
## 12:     4 <NA>   <NA> 12        F 54.4974836
## 13:     1    A      M 13        M 28.2733584
## 14:     2    B      F 14        F 92.3433484
## 15:     3    C      F 15        M 29.2315840
## 16:     4 <NA>   <NA> 16        F 83.7295628
## 17:     1    A      M 17        M 28.6223285
## 18:     2    B      F 18        F 26.6820780
## 19:     3    C      F 19        M 18.6722790
## 20:     4 <NA>   <NA> 20        F 23.2225911
# Fast join
setkey(DT1, class)
setkey(DT2, class)

DTM <- DT1[DT2]

# More than two variables
DTM <- DT1[DT11, on = .(id, class)]

# merge function
DTM <- merge(DT1, DT2, by = "class", all.x = TRUE)

Reshaping the data

# Wide format
set.seed(1234)
DTW <- data.table(country = c("Turkey", "Germany", "Korea", "US"),
                 Y2000 = rnorm(4), Y2001 = rnorm(4), Y2002 = rnorm(4), Y2003 = rnorm(4), Y2004 = rnorm(4))

DTW
##    country      Y2000      Y2001      Y2002       Y2003      Y2004
## 1:  Turkey -1.2070657  0.4291247 -0.5644520 -0.77625389 -0.5110095
## 2: Germany  0.2774292  0.5060559 -0.8900378  0.06445882 -0.9111954
## 3:   Korea  1.0844412 -0.5747400 -0.4771927  0.95949406 -0.8371717
## 4:      US -2.3456977 -0.5466319 -0.9983864 -0.11028549  2.4158352
# Wide-to-long format, melt function
DTL <- melt(DTW, id.vars = "country", variable.name = "year", value.name = "gdpgr")
DTL
##     country  year       gdpgr
##  1:  Turkey Y2000 -1.20706575
##  2: Germany Y2000  0.27742924
##  3:   Korea Y2000  1.08444118
##  4:      US Y2000 -2.34569770
##  5:  Turkey Y2001  0.42912469
##  6: Germany Y2001  0.50605589
##  7:   Korea Y2001 -0.57473996
##  8:      US Y2001 -0.54663186
##  9:  Turkey Y2002 -0.56445200
## 10: Germany Y2002 -0.89003783
## 11:   Korea Y2002 -0.47719270
## 12:      US Y2002 -0.99838644
## 13:  Turkey Y2003 -0.77625389
## 14: Germany Y2003  0.06445882
## 15:   Korea Y2003  0.95949406
## 16:      US Y2003 -0.11028549
## 17:  Turkey Y2004 -0.51100951
## 18: Germany Y2004 -0.91119542
## 19:   Korea Y2004 -0.83717168
## 20:      US Y2004  2.41583518
DTL[, year := as.integer(substr(year, 2, 5))]
DTL
##     country year       gdpgr
##  1:  Turkey 2000 -1.20706575
##  2: Germany 2000  0.27742924
##  3:   Korea 2000  1.08444118
##  4:      US 2000 -2.34569770
##  5:  Turkey 2001  0.42912469
##  6: Germany 2001  0.50605589
##  7:   Korea 2001 -0.57473996
##  8:      US 2001 -0.54663186
##  9:  Turkey 2002 -0.56445200
## 10: Germany 2002 -0.89003783
## 11:   Korea 2002 -0.47719270
## 12:      US 2002 -0.99838644
## 13:  Turkey 2003 -0.77625389
## 14: Germany 2003  0.06445882
## 15:   Korea 2003  0.95949406
## 16:      US 2003 -0.11028549
## 17:  Turkey 2004 -0.51100951
## 18: Germany 2004 -0.91119542
## 19:   Korea 2004 -0.83717168
## 20:      US 2004  2.41583518
str(DTL)
## Classes 'data.table' and 'data.frame':   20 obs. of  3 variables:
##  $ country: chr  "Turkey" "Germany" "Korea" "US" ...
##  $ year   : int  2000 2000 2000 2000 2001 2001 2001 2001 2002 2002 ...
##  $ gdpgr  : num  -1.207 0.277 1.084 -2.346 0.429 ...
##  - attr(*, ".internal.selfref")=<externalptr>
library(ggplot2)
ggplot(DTL, aes(x = year, y = gdpgr, color = country)) + geom_line() + theme_bw()

# Long-to-wide format, dcast function
DTW2 <- dcast(DTL, country ~ year, value.var = "gdpgr")
DTW2
##    country       2000       2001       2002        2003       2004
## 1: Germany  0.2774292  0.5060559 -0.8900378  0.06445882 -0.9111954
## 2:   Korea  1.0844412 -0.5747400 -0.4771927  0.95949406 -0.8371717
## 3:  Turkey -1.2070657  0.4291247 -0.5644520 -0.77625389 -0.5110095
## 4:      US -2.3456977 -0.5466319 -0.9983864 -0.11028549  2.4158352
str(DTW2)
## Classes 'data.table' and 'data.frame':   4 obs. of  6 variables:
##  $ country: chr  "Germany" "Korea" "Turkey" "US"
##  $ 2000   : num  0.277 1.084 -1.207 -2.346
##  $ 2001   : num  0.506 -0.575 0.429 -0.547
##  $ 2002   : num  -0.89 -0.477 -0.564 -0.998
##  $ 2003   : num  0.0645 0.9595 -0.7763 -0.1103
##  $ 2004   : num  -0.911 -0.837 -0.511 2.416
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "country"
# Rename variables
old_names <- names(DTW2)[2:6]
new_names <- paste0("Y", old_names)
setnames(DTW2, old_names, new_names)
DTW2
##    country      Y2000      Y2001      Y2002       Y2003      Y2004
## 1: Germany  0.2774292  0.5060559 -0.8900378  0.06445882 -0.9111954
## 2:   Korea  1.0844412 -0.5747400 -0.4771927  0.95949406 -0.8371717
## 3:  Turkey -1.2070657  0.4291247 -0.5644520 -0.77625389 -0.5110095
## 4:      US -2.3456977 -0.5466319 -0.9983864 -0.11028549  2.4158352

Special values

.I row number .N counter .GRP group number

# Row number of the highest grade
DT[grade == max(grade), .I]
## [1] 1
# Number of male and female students by class
DT[, .N, keyby = .(level, gender)]
##    level gender  N
## 1:     1      F 13
## 2:     1      M 12
## 3:     2      F 14
## 4:     2      M 11
## 5:     3      F 14
## 6:     3      M 11
## 7:     4      F 17
## 8:     4      M  8
# Create goup ID's
DT[, group_id := .GRP, by = .(level, gender)]
head(DT)
##    id level gender     grade loggrade pass honor hhonor       error       sqerr
## 1: 13     1      F 0.9503049 4.554198    0     0      0  0.06599349 0.004355141
## 2: 17     1      F 0.9458531 4.549502    0     0      0 -0.50247778 0.252483922
## 3: 97     1      F 0.8347027 4.424490    0     0      0  0.16698928 0.027885420
## 4: 21     1      F 0.7442772 4.309828    0     0      0  0.16818539 0.028286325
## 5: 41     1      F 0.5027498 3.917508    0     0      0 -0.19593462 0.038390375
## 6: 29     1      F 0.4969662 3.905937    0     0      0 -0.64906975 0.421291542
##       xgrade    agrade      dgrade group_id
## 1: 0.9987408 0.5178403  0.43246464        1
## 2: 0.9987408 0.5178403  0.42801283        1
## 3: 0.9987408 0.5178403  0.31686237        1
## 4: 0.9987408 0.5178403  0.22643692        1
## 5: 0.9987408 0.5178403 -0.01509047        1
## 6: 0.9987408 0.5178403 -0.02087412        1

Special functions

%in% %like% %between% shift

# Average grade for 2-4th classes
DT[level == 2 | level == 3 | level == 4, mean(grade)]
## [1] 0.5328196
DT[level %in% c(2:4), mean(grade)]
## [1] 0.5328196
DT[level %between% c(2, 4), mean(grade)]
## [1] 0.5328196
TS <- data.table(time = c(1:10))
TS[, gdp := time + rnorm(10, 0.05, 0.01)]
head(TS)
##    time      gdp
## 1:    1 1.051341
## 2:    2 2.045093
## 3:    3 3.045595
## 4:    4 4.054596
## 5:    5 5.043063
## 6:    6 6.035518
setkey(TS, time)
TS[, gdpgr := log(gdp/shift(gdp))]
head(TS)
##    time      gdp     gdpgr
## 1:    1 1.051341        NA
## 2:    2 2.045093 0.6653770
## 3:    3 3.045595 0.3982528
## 4:    4 4.054596 0.2861549
## 5:    5 5.043063 0.2181626
## 6:    6 6.035518 0.1796481

Reading and writing data

data.table functions are very fast in reading and writing csv files

fread

fwrite