ECON 413
Data manipulation with data.table
Erol Taymaz
Department of Economics
Middle East Technical University
Why data.table?
fread(“somedata.csv”)
data.table(x = 1:10)
as.data.table(df)
setDT(df)
Data table operations operate within the object
[ is a function!
dt[i, j, by]
i filter or arrange, operates on rows (observations)
j change/create, operates on columns (variables)
by group-by, groups observations
library(data.table)
set.seed(1234)
nstudent <- 100
DT <- data.table(id = 1:nstudent,
class = c(1:4),
gender = sample(c("M", "F"), size = 100, replace = TRUE),
grade = 100*runif(nstudent))
DT[class == 1,
mean(grade, na.rm = T),
by = gender]
## gender V1
## 1: F 50.76277
## 2: M 52.89040
set.seed(1234)
DF <- data.frame(id = 1:nstudent,
class = c(1:4),
gender = sample(c("M", "F"), size = 100, replace = TRUE),
grade = 100*runif(nstudent))
aggregate(DF[DF$class == 1, ]$grade, by = list(DF[DF$class == 1, ]$gender), mean)
## Group.1 x
## 1 F 50.76277
## 2 M 52.89040
## id class gender grade
## 1: 1 1 F 3.545673
## 2: 2 2 F 56.507611
## 3: 3 3 F 28.025778
## 4: 4 4 F 20.419632
## 5: 5 1 M 13.373890
## id class gender grade
## 1: 1 1 F 3.545673
## 2: 2 2 F 56.507611
## 3: 3 3 F 28.025778
## 4: 4 4 F 20.419632
## 5: 5 1 M 13.373890
## id class gender grade
## 1: 1 1 F 3.545673
## 2: 3 3 F 28.025778
## 3: 5 1 M 13.373890
## id class gender grade
## 1 1 1 F 3.545673
## 2 2 2 F 56.507611
## 3 3 3 F 28.025778
## 4 4 4 F 20.419632
## 5 5 1 M 13.373890
# Logical operator
DTM <- DT[gender == "M"]
DTI <- DT[grade >= 90]
# Multiple filters/conditions
DT[gender == "M" & grade > 80]
## id class gender grade
## 1: 31 3 M 96.22228
## 2: 35 3 M 91.43169
## 3: 37 1 M 90.81691
## 4: 40 4 M 86.91583
## 5: 42 2 M 98.36351
## 6: 49 1 M 98.07865
## 7: 58 2 M 99.87408
## 8: 69 1 M 80.24020
## 9: 85 1 M 95.59926
## 10: 87 3 M 84.17106
## 11: 92 4 M 99.25160
## 12: 95 3 M 81.00834
## id class gender grade
## 1: 86 2 F 2.220682
## 2: 1 1 F 3.545673
## 3: 10 2 F 3.864265
## 4: 68 4 M 7.271609
## 5: 84 4 F 7.608332
## id class gender grade
## 86 86 2 F 2.220682
## 1 1 1 F 3.545673
## 10 10 2 F 3.864265
## 68 68 4 M 7.271609
## 84 84 4 F 7.608332
## id class gender grade
## 1: 58 2 M 99.87408
## 2: 23 3 F 99.45982
## 3: 92 4 M 99.25160
## 4: 56 4 F 98.81674
## 5: 42 2 M 98.36351
## id class gender grade
## 1: 49 1 M 98.07865
## 2: 85 1 M 95.59926
## 3: 13 1 F 95.03049
## 4: 17 1 F 94.58531
## 5: 37 1 M 90.81691
## id class gender grade
## 1: 49 1 M 98.07865
## 2: 85 1 M 95.59926
## 3: 13 1 F 95.03049
## 4: 17 1 F 94.58531
## 5: 37 1 M 90.81691
Two types of operations
Same dataset Add, delete, or change columns - no change in the number of observations
New dataset “Aggregate” - reduce the number of observations
# Create a new variable
DT[, loggrade := log(grade)]
DT[, ngender := ifelse(gender == "M", 0, 1)]
DF$loggrade <- log(DF$grade)
DF$ngender <- ifelse(DF$gender == "M", 0, 1)
# Change an existing variable
DT[, grade := grade / 100]
# These changes are made by reference,
# i.e., in place. So we do not have to assign
# the object to save these changes.
# Modification by reference
names(DT)
## [1] "id" "class" "gender" "grade" "loggrade" "ngender"
## [1] "id" "class" "gender" "grade" "loggrade" "ngender"
## [1] "id" "class" "gender" "grade" "loggrade"
## [1] "id" "class" "gender" "grade" "loggrade"
# If you want to duplicate the data for some reason,
# use the __copy_ function
DT_copy <- copy(DT)
DT_copy[, gender := NULL]
names(DT_copy)
## [1] "id" "class" "grade" "loggrade"
## [1] "id" "class" "gender" "grade" "loggrade"
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## NA NA NA NaN NA NA 100
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 1 1 1 1 1 58
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
DT[, c("honor", "hhonor") :=
list(ifelse(grade >=80 & grade < 90, 1, 0),
ifelse(grade >=90, 1, 0))]
summary(DT[, list(honor, hhonor)])
## honor hhonor
## Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0
## Median :0 Median :0
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
## honor hhonor
## Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0
## Median :0 Median :0
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
## id class
## 1: 49 1
## 2: 85 1
## 3: 13 1
## 4: 17 1
## 5: 37 1
## id class
## 1: 49 1
## 2: 85 1
## 3: 13 1
## 4: 17 1
## 5: 37 1
## id class
## 1: 49 1
## 2: 85 1
## 3: 13 1
## 4: 17 1
## 5: 37 1
## id class
## 1: 49 1
## 2: 85 1
## 3: 13 1
## 4: 17 1
## 5: 37 1
## gender grade loggrade pass honor hhonor error sqerr
## 1: M 0.9807865 4.585770 0 0 0 0.41452353 0.171829761
## 2: M 0.9559926 4.560165 0 0 0 -0.47471847 0.225357629
## 3: F 0.9503049 4.554198 0 0 0 0.06599349 0.004355141
## 4: F 0.9458531 4.549502 0 0 0 -0.50247778 0.252483922
## 5: M 0.9081691 4.508846 0 0 0 -0.82599859 0.682273666
## [1] 49 85 13 17 37
## [1] 49 85 13 17 37
Operating on columns (new dataset, reduced number of rows)
## [1] 0.5290748
## [1] 52.90748
## [1] 0.5290748
## xgrade
## 1: 0.9987408
## agrade xgrade
## 1: 0.5290748 0.9987408
## id level gender grade loggrade pass honor hhonor error sqerr
## 1: 49 1 M 0.9807865 4.585770 0 0 0 0.41452353 0.171829761
## 2: 85 1 M 0.9559926 4.560165 0 0 0 -0.47471847 0.225357629
## 3: 13 1 F 0.9503049 4.554198 0 0 0 0.06599349 0.004355141
## 4: 17 1 F 0.9458531 4.549502 0 0 0 -0.50247778 0.252483922
## 5: 37 1 M 0.9081691 4.508846 0 0 0 -0.82599859 0.682273666
## xgrade
## 1: 0.9987408
## 2: 0.9987408
## 3: 0.9987408
## 4: 0.9987408
## 5: 0.9987408
## level V1
## 1: 1 0.5178403
## 2: 2 0.5263149
## 3: 3 0.5326580
## 4: 4 0.5394860
## level gender V1
## 1: 1 M 0.5289040
## 2: 1 F 0.5076277
## 3: 2 M 0.5940307
## 4: 2 F 0.4731096
## 5: 3 F 0.4809367
## 6: 3 M 0.5984851
## 7: 4 M 0.5090924
## 8: 4 F 0.5537889
## class gender grade
## 1 1 F 50.76277
## 2 2 F 47.31096
## 3 3 F 48.09367
## 4 4 F 55.37889
## 5 1 M 52.89040
## 6 2 M 59.40307
## 7 3 M 59.84851
## 8 4 M 50.90924
## level agrade xgrade
## 1: 1 0.5178403 0.9807865
## 2: 2 0.5263149 0.9987408
## 3: 3 0.5326580 0.9945982
## 4: 4 0.5394860 0.9925160
## level grade >= 70 atgrade xtgrade
## 1: 1 FALSE 0.5178403 0.9807865
## 2: 2 FALSE 0.5263149 0.9987408
## 3: 3 FALSE 0.5326580 0.9945982
## 4: 4 FALSE 0.5394860 0.9925160
## level agrade
## 1: 1 0.5178403
## 2: 2 0.5263149
## 3: 3 0.5326580
## 4: 4 0.5394860
## id level gender grade loggrade pass honor hhonor error sqerr
## 1: 49 1 M 0.9807865 4.585770 0 0 0 0.41452353 0.171829761
## 2: 85 1 M 0.9559926 4.560165 0 0 0 -0.47471847 0.225357629
## 3: 13 1 F 0.9503049 4.554198 0 0 0 0.06599349 0.004355141
## 4: 17 1 F 0.9458531 4.549502 0 0 0 -0.50247778 0.252483922
## 5: 37 1 M 0.9081691 4.508846 0 0 0 -0.82599859 0.682273666
## xgrade agrade dgrade
## 1: 0.9987408 0.5178403 0.4629462
## 2: 0.9987408 0.5178403 0.4381523
## 3: 0.9987408 0.5178403 0.4324646
## 4: 0.9987408 0.5178403 0.4280128
## 5: 0.9987408 0.5178403 0.3903289
## level V1 V2 V3
## 1: 1 0.5178403 0.05593210 0.6493600
## 2: 2 0.5263149 -0.01597593 0.8066647
## 3: 3 0.5326580 0.10037441 0.9004802
## 4: 4 0.5394860 0.02464213 1.8693256
## level grade error sqerr
## 1: 1 0.5178403 0.05593210 0.6493600
## 2: 2 0.5263149 -0.01597593 0.8066647
## 3: 3 0.5326580 0.10037441 0.9004802
## 4: 4 0.5394860 0.02464213 1.8693256
## level grade error sqerr
## 1: 1 0.5178403 0.05593210 0.6493600
## 2: 2 0.5263149 -0.01597593 0.8066647
## 3: 3 0.5326580 0.10037441 0.9004802
## 4: 4 0.5394860 0.02464213 1.8693256
## level id level gender
## 1: 1 integer integer character
## 2: 2 integer integer character
## 3: 3 integer integer character
## 4: 4 integer integer character
# .SD is useful especially if names are not known
nvar <- names(DT)[is.numeric(DT)]
DT[, lapply(.SD, mean), by = level, .SDcols = nvar]
## Empty data.table (0 rows and 1 cols): level
## Empty data.table (0 rows and 1 cols): level
key is used for very fast subsetting The dataset is ordered by key
## [1] "level" "gender"
## level gender V1
## 1: 1 F 0.5076277
## 2: 1 M 0.5289040
## 3: 2 F 0.4731096
## 4: 2 M 0.5940307
## 5: 3 F 0.4809367
## 6: 3 M 0.5984851
## 7: 4 F 0.5537889
## 8: 4 M 0.5090924
library(microbenchmark)
microbenchmark(DT[level == 2 & gender == "M", mean(grade)], times = 1000, unit = "ms")
## Unit: milliseconds
## expr min lq mean median
## DT[level == 2 & gender == "M", mean(grade)] 1.16845 1.344455 1.541725 1.415278
## uq max neval
## 1.612858 5.916052 1000
## Unit: milliseconds
## expr min lq mean median uq
## DT[.(2, "M"), mean(grade)] 0.577323 0.646441 0.7303203 0.681396 0.732868
## max neval
## 3.820128 1000
## Unit: milliseconds
## expr min lq mean
## DT[level %in% c(2:3) & gender == "M", mean(grade)] 1.047106 1.204598 1.343809
## median uq max neval
## 1.268401 1.361413 9.407477 1000
## Unit: milliseconds
## expr min lq mean median uq
## DT[.(2:3, "M"), mean(grade)] 0.524991 0.6233255 0.6939423 0.6461195 0.704652
## max neval
## 5.149863 1000
set.seed(1234)
nstudent <- 20
DT1 <- data.table(id = 1:nstudent,
class = c(1:4),
gender = c("M", "F"),
grade = 100*runif(nstudent))
DT11 <- data.table(id = 1:nstudent,
class = c(1:4),
hs = rnorm(nstudent))
DT2 <- data.table(class = c(1:3),
inst = c("A", "B", "C"),
gender = c("M", "F", "F"))
# Left join
DTM <- DT1[DT2, on = "class"]
DTM
## id class gender grade inst i.gender
## 1: 1 1 M 11.3703411 A M
## 2: 5 1 M 86.0915384 A M
## 3: 9 1 M 66.6083758 A M
## 4: 13 1 M 28.2733584 A M
## 5: 17 1 M 28.6223285 A M
## 6: 2 2 F 62.2299405 B F
## 7: 6 2 F 64.0310605 B F
## 8: 10 2 F 51.4251141 B F
## 9: 14 2 F 92.3433484 B F
## 10: 18 2 F 26.6820780 B F
## 11: 3 3 M 60.9274733 C F
## 12: 7 3 M 0.9495756 C F
## 13: 11 3 M 69.3591292 C F
## 14: 15 3 M 29.2315840 C F
## 15: 19 3 M 18.6722790 C F
## class inst gender id i.gender grade
## 1: 1 A M 1 M 11.3703411
## 2: 2 B F 2 F 62.2299405
## 3: 3 C F 3 M 60.9274733
## 4: 4 <NA> <NA> 4 F 62.3379442
## 5: 1 A M 5 M 86.0915384
## 6: 2 B F 6 F 64.0310605
## 7: 3 C F 7 M 0.9495756
## 8: 4 <NA> <NA> 8 F 23.2550506
## 9: 1 A M 9 M 66.6083758
## 10: 2 B F 10 F 51.4251141
## 11: 3 C F 11 M 69.3591292
## 12: 4 <NA> <NA> 12 F 54.4974836
## 13: 1 A M 13 M 28.2733584
## 14: 2 B F 14 F 92.3433484
## 15: 3 C F 15 M 29.2315840
## 16: 4 <NA> <NA> 16 F 83.7295628
## 17: 1 A M 17 M 28.6223285
## 18: 2 B F 18 F 26.6820780
## 19: 3 C F 19 M 18.6722790
## 20: 4 <NA> <NA> 20 F 23.2225911
# Wide format
set.seed(1234)
DTW <- data.table(country = c("Turkey", "Germany", "Korea", "US"),
Y2000 = rnorm(4), Y2001 = rnorm(4), Y2002 = rnorm(4), Y2003 = rnorm(4), Y2004 = rnorm(4))
DTW
## country Y2000 Y2001 Y2002 Y2003 Y2004
## 1: Turkey -1.2070657 0.4291247 -0.5644520 -0.77625389 -0.5110095
## 2: Germany 0.2774292 0.5060559 -0.8900378 0.06445882 -0.9111954
## 3: Korea 1.0844412 -0.5747400 -0.4771927 0.95949406 -0.8371717
## 4: US -2.3456977 -0.5466319 -0.9983864 -0.11028549 2.4158352
# Wide-to-long format, melt function
DTL <- melt(DTW, id.vars = "country", variable.name = "year", value.name = "gdpgr")
DTL
## country year gdpgr
## 1: Turkey Y2000 -1.20706575
## 2: Germany Y2000 0.27742924
## 3: Korea Y2000 1.08444118
## 4: US Y2000 -2.34569770
## 5: Turkey Y2001 0.42912469
## 6: Germany Y2001 0.50605589
## 7: Korea Y2001 -0.57473996
## 8: US Y2001 -0.54663186
## 9: Turkey Y2002 -0.56445200
## 10: Germany Y2002 -0.89003783
## 11: Korea Y2002 -0.47719270
## 12: US Y2002 -0.99838644
## 13: Turkey Y2003 -0.77625389
## 14: Germany Y2003 0.06445882
## 15: Korea Y2003 0.95949406
## 16: US Y2003 -0.11028549
## 17: Turkey Y2004 -0.51100951
## 18: Germany Y2004 -0.91119542
## 19: Korea Y2004 -0.83717168
## 20: US Y2004 2.41583518
## country year gdpgr
## 1: Turkey 2000 -1.20706575
## 2: Germany 2000 0.27742924
## 3: Korea 2000 1.08444118
## 4: US 2000 -2.34569770
## 5: Turkey 2001 0.42912469
## 6: Germany 2001 0.50605589
## 7: Korea 2001 -0.57473996
## 8: US 2001 -0.54663186
## 9: Turkey 2002 -0.56445200
## 10: Germany 2002 -0.89003783
## 11: Korea 2002 -0.47719270
## 12: US 2002 -0.99838644
## 13: Turkey 2003 -0.77625389
## 14: Germany 2003 0.06445882
## 15: Korea 2003 0.95949406
## 16: US 2003 -0.11028549
## 17: Turkey 2004 -0.51100951
## 18: Germany 2004 -0.91119542
## 19: Korea 2004 -0.83717168
## 20: US 2004 2.41583518
## Classes 'data.table' and 'data.frame': 20 obs. of 3 variables:
## $ country: chr "Turkey" "Germany" "Korea" "US" ...
## $ year : int 2000 2000 2000 2000 2001 2001 2001 2001 2002 2002 ...
## $ gdpgr : num -1.207 0.277 1.084 -2.346 0.429 ...
## - attr(*, ".internal.selfref")=<externalptr>
## country 2000 2001 2002 2003 2004
## 1: Germany 0.2774292 0.5060559 -0.8900378 0.06445882 -0.9111954
## 2: Korea 1.0844412 -0.5747400 -0.4771927 0.95949406 -0.8371717
## 3: Turkey -1.2070657 0.4291247 -0.5644520 -0.77625389 -0.5110095
## 4: US -2.3456977 -0.5466319 -0.9983864 -0.11028549 2.4158352
## Classes 'data.table' and 'data.frame': 4 obs. of 6 variables:
## $ country: chr "Germany" "Korea" "Turkey" "US"
## $ 2000 : num 0.277 1.084 -1.207 -2.346
## $ 2001 : num 0.506 -0.575 0.429 -0.547
## $ 2002 : num -0.89 -0.477 -0.564 -0.998
## $ 2003 : num 0.0645 0.9595 -0.7763 -0.1103
## $ 2004 : num -0.911 -0.837 -0.511 2.416
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "country"
# Rename variables
old_names <- names(DTW2)[2:6]
new_names <- paste0("Y", old_names)
setnames(DTW2, old_names, new_names)
DTW2
## country Y2000 Y2001 Y2002 Y2003 Y2004
## 1: Germany 0.2774292 0.5060559 -0.8900378 0.06445882 -0.9111954
## 2: Korea 1.0844412 -0.5747400 -0.4771927 0.95949406 -0.8371717
## 3: Turkey -1.2070657 0.4291247 -0.5644520 -0.77625389 -0.5110095
## 4: US -2.3456977 -0.5466319 -0.9983864 -0.11028549 2.4158352
.I row number .N counter .GRP group number
## [1] 1
## level gender N
## 1: 1 F 13
## 2: 1 M 12
## 3: 2 F 14
## 4: 2 M 11
## 5: 3 F 14
## 6: 3 M 11
## 7: 4 F 17
## 8: 4 M 8
## id level gender grade loggrade pass honor hhonor error sqerr
## 1: 13 1 F 0.9503049 4.554198 0 0 0 0.06599349 0.004355141
## 2: 17 1 F 0.9458531 4.549502 0 0 0 -0.50247778 0.252483922
## 3: 97 1 F 0.8347027 4.424490 0 0 0 0.16698928 0.027885420
## 4: 21 1 F 0.7442772 4.309828 0 0 0 0.16818539 0.028286325
## 5: 41 1 F 0.5027498 3.917508 0 0 0 -0.19593462 0.038390375
## 6: 29 1 F 0.4969662 3.905937 0 0 0 -0.64906975 0.421291542
## xgrade agrade dgrade group_id
## 1: 0.9987408 0.5178403 0.43246464 1
## 2: 0.9987408 0.5178403 0.42801283 1
## 3: 0.9987408 0.5178403 0.31686237 1
## 4: 0.9987408 0.5178403 0.22643692 1
## 5: 0.9987408 0.5178403 -0.01509047 1
## 6: 0.9987408 0.5178403 -0.02087412 1
%in% %like% %between% shift
## [1] 0.5328196
## [1] 0.5328196
## [1] 0.5328196
## time gdp
## 1: 1 1.051341
## 2: 2 2.045093
## 3: 3 3.045595
## 4: 4 4.054596
## 5: 5 5.043063
## 6: 6 6.035518
## time gdp gdpgr
## 1: 1 1.051341 NA
## 2: 2 2.045093 0.6653770
## 3: 3 3.045595 0.3982528
## 4: 4 4.054596 0.2861549
## 5: 5 5.043063 0.2181626
## 6: 6 6.035518 0.1796481
data.table functions are very fast in reading and writing csv files
fread
fwrite