ECON 413
Data types and data objects


Erol Taymaz
Department of Economics
Middle East Technical University

Topics

Use R for

Installing R

Using R

Using R

Objects

Using R

Using R

Using R

Using R

Using R

R console

Using RStudio

RStudio shortcuts

Installing R packages

# Install a package from CRAN. The package name must be a quoted string;
# an unquoted name is looked up as a variable and fails with
# "object 'ggplot2' not found".
install.packages("ggplot2")
# Update every installed package for which a newer version is available.
update.packages()
# Update a single package: name it via `oldPkgs` (the first positional
# argument of update.packages() is `lib.loc`, not a package name).
update.packages(oldPkgs = "ggplot2")
# Attach the package; throws an error if the package is not installed.
library(ggplot2)
# Attach the package; returns FALSE with a warning (no error) if the
# package is not installed.
require(ggplot2)

R as calculator

Key operators

R objects

Everything in R is an object

# Data, functions and fitted models are all objects: each can be assigned
# to a name, printed and inspected.
# An integer vector
a <- c(1:5)
a
## [1] 1 2 3 4 5
sum(a)
## [1] 15
# Typing a function's name without parentheses prints the function object
sum
## function (..., na.rm = FALSE)  .Primitive("sum")
# No seed is set here, so the random draws (and every number printed below
# that depends on them) differ from run to run.
a <- rnorm(100)
b <- a + rnorm(100)
# Linear regression of a on b; the fitted model is stored as an object too
model_1 <- lm(a ~ b)
model_1
## 
## Call:
## lm(formula = a ~ b)
## 
## Coefficients:
## (Intercept)            b  
##     0.06506      0.42404
# str() displays the internal structure of the fitted-model object
str(model_1)
## List of 12
##  $ coefficients : Named num [1:2] 0.0651 0.424
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "b"
##  $ residuals    : Named num [1:100] 0.5182 1.0284 0.7242 0.0114 -0.062 ...
##   ..- attr(*, "names")= chr [1:100] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:100] 0.2518 -5.9784 0.6507 -0.082 -0.0843 ...
##   ..- attr(*, "names")= chr [1:100] "(Intercept)" "b" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:100] -0.00219 0.40228 0.14876 0.2787 -0.18484 ...
##   ..- attr(*, "names")= chr [1:100] "1" "2" "3" "4" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:100, 1:2] -10 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:100] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "b"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.1 1.07
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 98
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = a ~ b)
##  $ terms        :Classes 'terms', 'formula'  language a ~ b
##   .. ..- attr(*, "variables")= language list(a, b)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "a" "b"
##   .. .. .. ..$ : chr "b"
##   .. ..- attr(*, "term.labels")= chr "b"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(a, b)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "a" "b"
##  $ model        :'data.frame':   100 obs. of  2 variables:
##   ..$ a: num [1:100] 0.516 1.431 0.873 0.29 -0.247 ...
##   ..$ b: num [1:100] -0.159 0.795 0.197 0.504 -0.589 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language a ~ b
##   .. .. ..- attr(*, "variables")= language list(a, b)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "a" "b"
##   .. .. .. .. ..$ : chr "b"
##   .. .. ..- attr(*, "term.labels")= chr "b"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(a, b)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "a" "b"
##  - attr(*, "class")= chr "lm"
# Components of the model object are extracted with `$`
model_1$coefficients
## (Intercept)           b 
##  0.06506165  0.42403757
# The mean of the residuals is zero up to floating-point rounding error
mean(model_1$residuals)
## [1] -2.488895e-17
# summary() is generic: its output depends on the class of its argument
# (a numeric vector in the first call, an "lm" object in the second)
summary(a)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.35313 -0.63566  0.05574 -0.02518  0.63855  1.58078
summary(model_1)
## 
## Call:
## lm(formula = a ~ b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.70619 -0.47993  0.00742  0.49945  1.31451 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.06506    0.06712   0.969    0.335    
## b            0.42404    0.04707   9.008 1.72e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6637 on 98 degrees of freedom
## Multiple R-squared:  0.453,  Adjusted R-squared:  0.4474 
## F-statistic: 81.14 on 1 and 98 DF,  p-value: 1.719e-14

``

Object class

All objects belong to one or more classes. There is no limit on the number of classes.

The class of an object defines how the object will be treated by functions.

# class() reports the class(es) of an object.
# An integer sequence created with `:`
A <- c(1:5)
class(A)
## [1] "integer"
B <- c("a", "b", "c")
class(B)
## [1] "character"
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, so they are not safe substitutes.
C <- c(TRUE, FALSE, TRUE)
class(C)
## [1] "logical"
# Mixing strings and numbers coerces every element to character
D <- c("a", "b", "c", 12, 24)
class(D)
## [1] "character"
# Mixing numbers and logicals coerces the logicals to numeric
# (TRUE becomes 1, FALSE becomes 0)
E <- c(3, 5, TRUE, FALSE, TRUE)
class(E)
## [1] "numeric"
# The data frame is named DF rather than F so that the built-in F
# (an alias of FALSE) is not masked.
DF <- data.frame(a = c(1:5), b = rnorm(5), d = c("a", "b", "c", "d", "e"))
class(DF)
## [1] "data.frame"
# Each column of a data frame keeps its own class
class(DF$a)
## [1] "integer"
# byrow = TRUE fills the matrix row by row
M <- matrix(c(1:15), nrow = 5, ncol = 3, byrow = TRUE)
class(M)
## [1] "matrix" "array"
# A matrix holds a single type: adding "A" turns all elements into strings,
# but the object is still a matrix
M2 <- matrix(c(1:5, "A"), nrow = 3, ncol = 2, byrow = FALSE)
class(M2)
## [1] "matrix" "array"
# Remove all (non-hidden) objects from the workspace
rm(list=ls())

Special values

a <- c(1, 2)
a
## [1] 1 2
# Assigning to an index one past the end extends the vector
a[3] <- 3
# NA marks a missing value
a[4] <- NA
# Dividing a non-zero number by zero gives Inf or -Inf
a[5] <- 1 / 0
a[6] <- -1 / 0
# 0 / 0 is undefined and gives NaN ("not a number")
a[7] <- 0 / 0

a
## [1]    1    2    3   NA  Inf -Inf  NaN

Data objects

Vectors

c, rep, seq, sample and runif functions

# c() combines its arguments into a vector
a <- c(1, 2, 4)
# The L suffix creates integers instead of doubles
b <- c(1L, 2L, 4L)
# T and F are abbreviations for TRUE and FALSE.
# A variable may be named `c`; calls to c() still find the function.
c <- c(TRUE, FALSE, T, F)
d <- c("This", "That")

class(a)
## [1] "numeric"
str(a)
##  num [1:3] 1 2 4
summary(a)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.500   2.000   2.333   3.000   4.000
# The colon operator builds increasing or decreasing integer sequences
a <- c(1:5)
b <- c(5:1)
# c() also concatenates existing vectors
ab <- c(a,b)
a
## [1] 1 2 3 4 5
b
## [1] 5 4 3 2 1
ab
##  [1] 1 2 3 4 5 5 4 3 2 1
# rep() repeats the elements of a vector.
# times: repeat the whole vector this many times
a <- rep(c(1:2), times = 5)
# each: repeat every element this many times before moving to the next
b <- rep(c(1:2), each = 3)
# each is applied first, then the result is repeated `times` times
d <- rep(c(1:2), times = 2, each = 3)
# length.out: recycle the vector until the result has this length
# (the argument name is written in full rather than partially matched)
e <- rep(c(1:2), length.out = 5)
a
##  [1] 1 2 1 2 1 2 1 2 1 2
b
## [1] 1 1 1 2 2 2
d
##  [1] 1 1 1 2 2 2 1 1 1 2 2 2
e
## [1] 1 2 1 2 1
# seq() generates regular sequences: give either the step size (by) ...
a <- seq(from = 1, to = 2, by = .1)
# Same call with `from` and `to` matched by position
a <- seq(1, 2, by = .1)
# ... or the number of points wanted (length.out)
b <- seq(1, 2, length.out = 7)

a
##  [1] 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0
b
## [1] 1.000000 1.166667 1.333333 1.500000 1.666667 1.833333 2.000000
# sample() draws at random; no seed is set, so the printed draws vary by run.
# With replacement: values can repeat and size may exceed the population
a <- sample(c(1:5), size = 10, replace = TRUE)
# Without replacement: every drawn value is distinct
b <- sample(c(1:10), size = 5, replace = FALSE)
a
##  [1] 2 4 1 2 3 1 2 1 1 3
b
## [1]  2  7  8  6 10
# runif() draws from the uniform distribution on [0, 1] by default
a <- runif(10)
a
##  [1] 0.8401158 0.7025864 0.0280010 0.9007178 0.8202461 0.3120585 0.8927049
##  [8] 0.9495726 0.5212997 0.6776818
# set.seed() fixes the state of the random number generator so that the
# draws that follow are reproducible
set.seed(123)
a <- runif(10)
a
##  [1] 0.2875775 0.7883051 0.4089769 0.8830174 0.9404673 0.0455565 0.5281055
##  [8] 0.8924190 0.5514350 0.4566147
# Remove all (non-hidden) objects from the workspace
rm(list=ls())
# Create vectors of a given type and length, filled with the type's
# default value (0 for numeric, "" for character)
aa <- vector(mode = "numeric", length = 10)
aa
##  [1] 0 0 0 0 0 0 0 0 0 0
# numeric() produces the same result as vector(mode = "numeric", ...)
bb <- numeric(length = 10)
bb
##  [1] 0 0 0 0 0 0 0 0 0 0
identical(aa, bb)
## [1] TRUE
cc <- character(length = 10)
cc
##  [1] "" "" "" "" "" "" "" "" "" ""
# Empty strings cannot be read as numbers, so the conversion gives NA
dd <- as.numeric(cc)
dd
##  [1] NA NA NA NA NA NA NA NA NA NA

Vector arithmetics

# Arithmetic and comparison operators work element by element
mt1 <- c(3, 5, 7, 9)

mt2 <- c(4, 6, 10, 10)

mt2 > mt1
## [1] TRUE TRUE TRUE TRUE
mt1 + mt2
## [1]  7 11 17 19
# Element-wise average of the two vectors
ave <- 0.5 * (mt1 + mt2)

# Recycling: the shorter vector is repeated to match the longer one
mt3 <- c(0, 10)

mt2 + mt3
## [1]  4 16 10 20
# When the longer length is not a multiple of the shorter one,
# R still recycles but issues a warning
fin <- c(6, 7, 8, 9, 10)

mt2 + fin
## Warning in mt2 + fin: longer object length is not a multiple of shorter object
## length
## [1] 10 13 18 19 14
# max() returns the single largest value across all its arguments
max(mt1)
## [1] 9
max(mt1, mt2)
## [1] 10
# pmax() returns the element-wise ("parallel") maximum
pmax(mt1, mt2)
## [1]  4  6 10 10
# The scalar 5 is recycled, so every element is floored at 5
pmax(mt1, 5)
## [1] 5 5 7 9

Indexing with vectors

a <- c(1, 2, 4)
# Indices start at 1
a[1]
## [1] 1
# Reading an index beyond the end of the vector returns NA
a[c(1, 4)]
## [1]  1 NA
# Assigning beyond the end extends the vector
a[4] <- 40
a[c(1, 4)]
## [1]  1 40
# Positions skipped when extending are filled with NA
a[10] <- 10
a
##  [1]  1  2  4 40 NA NA NA NA NA 10
# Remove all (non-hidden) objects from the workspace
rm(list = ls())

Named vectors

mt <- c(3, 5, 7)

students <- c("Ali", "Ayşe", "Fatma")

# Attach a name to each element of the vector
names(mt) <- students

mt
##   Ali  Ayşe Fatma 
##     3     5     7
# Elements can then be selected by name
mt["Ayşe"]
## Ayşe 
##    5
# The result follows the order of the requested names
mt[c("Ayşe", "Ali")]
## Ayşe  Ali 
##    5    3

Lists

# A list can hold elements of different types and lengths
a <- list(a=c(1:10), b = c("a"), c = c(TRUE, FALSE, TRUE, FALSE))
a
## $a
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $b
## [1] "a"
## 
## $c
## [1]  TRUE FALSE  TRUE FALSE
summary(a)
##   Length Class  Mode     
## a 10     -none- numeric  
## b  1     -none- character
## c  4     -none- logical
# Single brackets return a sub-list (still a list)
a[1]
## $a
##  [1]  1  2  3  4  5  6  7  8  9 10
# Double brackets return the element itself
a[[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10
class(a[1])
## [1] "list"
class(a[[1]])
## [1] "integer"
# Extract an element with [[ ]], then index into it with [ ]
a[[1]][3]
## [1] 3
a[[3]][1]
## [1] TRUE

Matrices

# matrix() fills the matrix column by column by default
a <- matrix(1:15, ncol = 3, nrow = 5)
b <- matrix(c("a", "b", "c", "d", "e", "f"), ncol = 3, nrow = 2)

a
##      [,1] [,2] [,3]
## [1,]    1    6   11
## [2,]    2    7   12
## [3,]    3    8   13
## [4,]    4    9   14
## [5,]    5   10   15
b
##      [,1] [,2] [,3]
## [1,] "a"  "c"  "e" 
## [2,] "b"  "d"  "f"
class(a)
## [1] "matrix" "array"
str(a)
##  int [1:5, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
# summary() of a matrix summarises each column separately
summary(a)
##        V1          V2           V3    
##  Min.   :1   Min.   : 6   Min.   :11  
##  1st Qu.:2   1st Qu.: 7   1st Qu.:12  
##  Median :3   Median : 8   Median :13  
##  Mean   :3   Mean   : 8   Mean   :13  
##  3rd Qu.:4   3rd Qu.: 9   3rd Qu.:14  
##  Max.   :5   Max.   :10   Max.   :15
a
##      [,1] [,2] [,3]
## [1,]    1    6   11
## [2,]    2    7   12
## [3,]    3    8   13
## [4,]    4    9   14
## [5,]    5   10   15
# Rows 1 and 3, all columns
a[c(1,3),]
##      [,1] [,2] [,3]
## [1,]    1    6   11
## [2,]    3    8   13
# A single index vector without a comma treats the matrix as one long
# vector in column order: elements 1 and 15
a[c(1,15)]
## [1]  1 15
# Column 2, all rows; the result is a plain vector
a[, 2]
## [1]  6  7  8  9 10
# Rows 1 and 3 of column 2
a[c(1,3), 2]
## [1] 6 8

Arrays

An array is a multidimensional object. A matrix is a two-dimensional (n × m) array.

# An array with two dimensions is a matrix
aa <- array(c(1:12), dim = c(6, 2))
aa
##      [,1] [,2]
## [1,]    1    7
## [2,]    2    8
## [3,]    3    9
## [4,]    4   10
## [5,]    5   11
## [6,]    6   12
class(aa)
## [1] "matrix" "array"
# dim = c(rows, columns, layers)
bb <- array(c(1:24), dim = c(4, 3, 2))
# Two 4x3 matrices
bb
## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]   13   17   21
## [2,]   14   18   22
## [3,]   15   19   23
## [4,]   16   20   24
# With three dimensions the object is no longer a matrix
class(bb)
## [1] "array"
# Row 3, column 2 of the first layer
bb[3, 2, 1]
## [1] 7

Data frames

# A data frame is a table whose columns may have different types
aa <- data.frame(a = 1:4, b = c("a", "b", "c", "d"), 
                 z = c(1, 3, 5, NA))
# Length-one values (a = 1, z = "Z") are recycled to the number of rows
bb <- data.frame(a = 1, b = c("A", "B", "C", "D"), z = "Z", 
                 stringsAsFactors = FALSE)

aa
##   a b  z
## 1 1 a  1
## 2 2 b  3
## 3 3 c  5
## 4 4 d NA
bb
##   a b z
## 1 1 A Z
## 2 1 B Z
## 3 1 C Z
## 4 1 D Z
class(aa)
## [1] "data.frame"
# names() returns the column names
names(aa)
## [1] "a" "b" "z"
str(aa)
## 'data.frame':    4 obs. of  3 variables:
##  $ a: int  1 2 3 4
##  $ b: chr  "a" "b" "c" "d"
##  $ z: num  1 3 5 NA
# summary() reports each column according to its type and counts the NAs
summary(aa)
##        a             b                   z    
##  Min.   :1.00   Length:4           Min.   :1  
##  1st Qu.:1.75   Class :character   1st Qu.:2  
##  Median :2.50   Mode  :character   Median :3  
##  Mean   :2.50                      Mean   :3  
##  3rd Qu.:3.25                      3rd Qu.:4  
##  Max.   :4.00                      Max.   :5  
##                                    NA's   :1
# cbind() places the data frames side by side; the duplicate column
# names are kept as they are
cc <- cbind(aa, bb)
cc
##   a b  z a b z
## 1 1 a  1 1 A Z
## 2 2 b  3 1 B Z
## 3 3 c  5 1 C Z
## 4 4 d NA 1 D Z
# rbind() stacks the rows; column z becomes character because bb$z is
# character
dd <- rbind(aa, bb)
dd
##   a b    z
## 1 1 a    1
## 2 2 b    3
## 3 3 c    5
## 4 4 d <NA>
## 5 1 A    Z
## 6 1 B    Z
## 7 1 C    Z
## 8 1 D    Z

Use the merge function to merge two data frames

# Three ways to extract a column as a vector
aa$a
## [1] 1 2 3 4
aa[,"a"]
## [1] 1 2 3 4
aa[,1]
## [1] 1 2 3 4
# Single brackets with only a name return a one-column data frame
aa["a"]
##   a
## 1 1
## 2 2
## 3 3
## 4 4
# Double brackets return the column itself as a vector
aa[["a"]]
## [1] 1 2 3 4
# The column name can also be supplied through a variable
vname <- "a"
aa[, vname]
## [1] 1 2 3 4
aa[vname]
##   a
## 1 1
## 2 2
## 3 3
## 4 4
# Rows are selected before the comma; leaving the column position empty
# keeps all columns
aa[1,]
##   a b z
## 1 1 a 1
aa[1:2,]
##   a b z
## 1 1 a 1
## 2 2 b 3
aa[c(1,3),]
##   a b z
## 1 1 a 1
## 3 3 c 5
# Do not forget the trailing comma when selecting rows: without it,
# aa[1] selects the first column instead of the first row
# Four equivalent ways to take rows 1-2 of column b
aa[1:2,2]
## [1] "a" "b"
aa[1:2,"b"]
## [1] "a" "b"
aa["b"][1:2,]
## [1] "a" "b"
aa[1:2,]$b
## [1] "a" "b"
# Assigning to a new name with $ adds a column
aa$x <- c(4, 8, 1.5, 7)
# New columns can be computed from existing ones
aa$y <- aa$a * aa$x
aa
##   a b  z   x    y
## 1 1 a  1 4.0  4.0
## 2 2 b  3 8.0 16.0
## 3 3 c  5 1.5  4.5
## 4 4 d NA 7.0 28.0
# Assigning NULL removes a column
aa$x <- NULL
aa
##   a b  z    y
## 1 1 a  1  4.0
## 2 2 b  3 16.0
## 3 3 c  5  4.5
## 4 4 d NA 28.0
# Overwrite a column with a transformation of itself
aa$y <- sqrt(aa$y) 
aa
##   a b  z        y
## 1 1 a  1 2.000000
## 2 2 b  3 4.000000
## 3 3 c  5 2.121320
## 4 4 d NA 5.291503
# Change a single element of a column
aa$y[1] <- 10
aa
##   a b  z         y
## 1 1 a  1 10.000000
## 2 2 b  3  4.000000
## 3 3 c  5  2.121320
## 4 4 d NA  5.291503
# Change a single cell addressed by row and column number
aa[1,2] <- "aaa"
aa
##   a   b  z         y
## 1 1 aaa  1 10.000000
## 2 2   b  3  4.000000
## 3 3   c  5  2.121320
## 4 4   d NA  5.291503
# A logical condition before the comma keeps the rows where it is TRUE
aa[aa$a > 2, ]
##   a b  z        y
## 3 3 c  5 2.121320
## 4 4 d NA 5.291503
# A column selection can be added after the comma
aa[aa$a > 2, c(1:3)]
##   a b  z
## 3 3 c  5
## 4 4 d NA
# Conditions are combined with & (and) and | (or)
aa[aa$a < 3 & (aa$b == "aaa" | aa$z == 3), c(1:3)]
##   a   b z
## 1 1 aaa 1
## 2 2   b 3
# Assigning to a selection overwrites every selected cell; column b is
# character, so its new value is stored as the string "1000"
aa[aa$a < 2, c(1:3)] <- 1000
aa
##      a    b    z         y
## 1 1000 1000 1000 10.000000
## 2    2    b    3  4.000000
## 3    3    c    5  2.121320
## 4    4    d   NA  5.291503
# na.omit() drops every row that contains at least one NA (row 4 here)
aaa  <- na.omit(aa)
aaa
##      a    b    z        y
## 1 1000 1000 1000 10.00000
## 2    2    b    3  4.00000
## 3    3    c    5  2.12132

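Be very careful when using na.omit: it drops every row that contains at least one NA, even if the missing value is in a variable you do not need.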

Data frames - wide and long formats

GDP growth rate data

Country 2000 2001 2002 2003 2004
Germany 0.0409 0.0013 0.0126 0.0126 0.0001
Korea 0.0262 0.0232 0.0186 0.0244 0.0146
Turkey 0.0384 0.0011 0.0099 0.0399 0.0316
US 0.0220 0.0107 0.0143 0.0318 0.0274

How many variables are there in the data?

How many observations?

Data frames - wide and long formats

How many variables? 3 variables (country, year, gdp growth rate)

How many observations? 4x5 = 20 observations

GDP growth rate data - long format

Country year gdpgr
Germany 2000 0.0409
Germany 2001 0.0013
Germany 2002 0.0126
Germany 2003 0.0126

Use the reshape function to convert wide-to-long and long-to-wide