ECON 413
Visualization

Erol Taymaz
Department of Economics
Middle East Technical University

Topics

Data visualization

“The use of computer-generated, interactive, visual representations of data to amplify cognition.”

[Card, Mackinlay, & Shneiderman 1999]

R objects

Refresh our memory: Everything in R is an object

a <- c(1:5)
a
## [1] 1 2 3 4 5
sum(a)
## [1] 15
sum
## function (..., na.rm = FALSE)  .Primitive("sum")
b <- plot(a)

b
## NULL
a <- rnorm(100)
b <- a + rnorm(100)
model_1 <- lm(a ~ b)
model_1
## 
## Call:
## lm(formula = a ~ b)
## 
## Coefficients:
## (Intercept)            b  
##     0.03902      0.51023
summary(a)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.82038 -0.83288  0.07229 -0.12307  0.55630  1.73368
summary(model_1)
## 
## Call:
## lm(formula = a ~ b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.94008 -0.64831  0.04819  0.55899  1.69512 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.03902    0.08372   0.466    0.642    
## b            0.51023    0.05815   8.775 5.49e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8165 on 98 degrees of freedom
## Multiple R-squared:   0.44,  Adjusted R-squared:  0.4343 
## F-statistic:    77 on 1 and 98 DF,  p-value: 5.495e-14
plot(a)

plot(a, b)

plot(model_1)

Data Science Process

Data Science Process

Data handling

Cleaning the data

Exploratory visualization

Example 1

Load the libraries (install them in not installed)

library(mice)
library(ggplot2)
library(GGally)
library(ggthemes)
library(haven)
library(data.table)

Example 1

Read the dataset

dt <- read.csv("https://raw.githubusercontent.com/etaymaz/econ413/master/Data/003_data.csv")
set.seed(123)
dt <- data.frame(id = c(1:1000), exper = rnorm(1000, 0, 1), 
    gender = c(1:2), educ = sample(5, 1000, replace = TRUE))

dt$b[runif(100)<0.1] <- NA
dt$d[runif(100)<0.1] <- NA

dt <- within(dt, wage <- gender + (gender-1)*educ  + 
               (exper * (gender-1) * (educ/5)) + rnorm(1000, 0, 1))

Example 1

View the data

class(dt)
## [1] "data.frame"
names(dt)
## [1] "id"     "exper"  "gender" "educ"   "wage"
dim(dt)
## [1] 1000    5
str(dt)
## 'data.frame':    1000 obs. of  5 variables:
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ exper : num  -0.87 1.503 0.642 NA -0.712 ...
##  $ gender: int  1 2 1 2 1 2 1 2 1 2 ...
##  $ educ  : int  NA 3 2 NA 3 5 5 2 5 5 ...
##  $ wage  : num  NA 6.5486 0.0909 NA -1.4579 ...
head(dt)
##   id      exper gender educ        wage
## 1  1 -0.8699983      1   NA          NA
## 2  2  1.5026771      2    3  6.54857412
## 3  3  0.6415719      1    2  0.09093394
## 4  4         NA      2   NA          NA
## 5  5 -0.7122940      1    3 -1.45789677
## 6  6  2.0612542      2    5  8.11485313
tail(dt)
##        id       exper gender educ       wage
## 995   995 -0.19416947      1    4 -0.3775560
## 996   996          NA      2    1         NA
## 997   997 -0.51001423      1    1 -0.3303003
## 998   998 -0.19601343      2    5  6.3701062
## 999   999 -0.01743695      1    5  1.1119297
## 1000 1000          NA      2    3         NA
table(dt$educ)
## 
##   1   2   3   4   5 
## 178 179 177 184 182
table(dt$gender, dt$educ)
##    
##       1   2   3   4   5
##   1  90  85  97 100  88
##   2  88  94  80  84  94
xtabs( ~ gender + educ, data = dt)
##       educ
## gender   1   2   3   4   5
##      1  90  85  97 100  88
##      2  88  94  80  84  94

Example 1

Missing values

md.pattern(dt)

##     id gender educ exper wage    
## 780  1      1    1     1    1   0
## 120  1      1    1     0    0   2
## 70   1      1    0     1    0   2
## 30   1      1    0     0    0   3
##      0      0  100   150  220 470

Descriptive statistics

summary(dt)
##        id             exper              gender         educ      
##  Min.   :   1.0   Min.   :-2.64866   Min.   :1.0   Min.   :1.000  
##  1st Qu.: 250.8   1st Qu.:-0.60161   1st Qu.:1.0   1st Qu.:2.000  
##  Median : 500.5   Median : 0.04126   Median :1.5   Median :3.000  
##  Mean   : 500.5   Mean   : 0.04952   Mean   :1.5   Mean   :3.014  
##  3rd Qu.: 750.2   3rd Qu.: 0.69551   3rd Qu.:2.0   3rd Qu.:4.000  
##  Max.   :1000.0   Max.   : 3.38499   Max.   :2.0   Max.   :5.000  
##                   NA's   :150                      NA's   :100    
##       wage        
##  Min.   :-1.8845  
##  1st Qu.: 0.9648  
##  Median : 2.1786  
##  Mean   : 2.8972  
##  3rd Qu.: 4.7516  
##  Max.   :10.8053  
##  NA's   :220
summary(subset(dt, gender == 1))
##        id            exper              gender       educ      
##  Min.   :  1.0   Min.   :-2.64866   Min.   :1   Min.   :1.000  
##  1st Qu.:250.5   1st Qu.:-0.58780   1st Qu.:1   1st Qu.:2.000  
##  Median :500.0   Median : 0.05770   Median :1   Median :3.000  
##  Mean   :500.0   Mean   : 0.05215   Mean   :1   Mean   :3.024  
##  3rd Qu.:749.5   3rd Qu.: 0.66337   3rd Qu.:1   3rd Qu.:4.000  
##  Max.   :999.0   Max.   : 3.38499   Max.   :1   Max.   :5.000  
##                  NA's   :50                     NA's   :40     
##       wage        
##  Min.   :-1.8845  
##  1st Qu.: 0.3509  
##  Median : 1.0158  
##  Mean   : 0.9996  
##  3rd Qu.: 1.6446  
##  Max.   : 3.6328  
##  NA's   :90
summary(subset(dt, gender == 2))
##        id             exper              gender       educ      
##  Min.   :   2.0   Min.   :-2.57297   Min.   :2   Min.   :1.000  
##  1st Qu.: 251.5   1st Qu.:-0.61461   1st Qu.:2   1st Qu.:2.000  
##  Median : 501.0   Median : 0.03077   Median :2   Median :3.000  
##  Mean   : 501.0   Mean   : 0.04656   Mean   :2   Mean   :3.005  
##  3rd Qu.: 750.5   3rd Qu.: 0.74073   3rd Qu.:2   3rd Qu.:4.000  
##  Max.   :1000.0   Max.   : 3.11620   Max.   :2   Max.   :5.000  
##                   NA's   :100                    NA's   :60     
##       wage        
##  Min.   : 0.4935  
##  1st Qu.: 3.6629  
##  Median : 4.8601  
##  Mean   : 4.9998  
##  3rd Qu.: 6.4615  
##  Max.   :10.8053  
##  NA's   :130

Example 1

Correlations

# Correlations - variables 2-4
cor(dt[, 2:5])
##        exper gender educ wage
## exper      1     NA   NA   NA
## gender    NA      1   NA   NA
## educ      NA     NA    1   NA
## wage      NA     NA   NA    1
cor(dt[, 2:5], use = "complete.obs")
##              exper        gender         educ       wage
## exper   1.00000000 -0.0104530077 0.0236341481 0.09952051
## gender -0.01045301  1.0000000000 0.0009589141 0.80603867
## educ    0.02363415  0.0009589141 1.0000000000 0.27049172
## wage    0.09952051  0.8060386732 0.2704917170 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs")
##               exper       gender         educ       wage
## exper   1.000000000 -0.002892561  0.023634148 0.09952051
## gender -0.002892561  1.000000000 -0.006840444 0.80603867
## educ    0.023634148 -0.006840444  1.000000000 0.27049172
## wage    0.099520506  0.806038673  0.270491717 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs", method = "kendall")
##              exper       gender         educ       wage
## exper  1.000000000  0.006545028  0.018319032 0.04909647
## gender 0.006545028  1.000000000 -0.005881101 0.67278394
## educ   0.018319032 -0.005881101  1.000000000 0.14863020
## wage   0.049096475  0.672783940  0.148630199 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs", method = "spearman")
##              exper       gender         educ       wage
## exper  1.000000000  0.008011278  0.025838438 0.07262175
## gender 0.008011278  1.000000000 -0.006575184 0.82346099
## educ   0.025838438 -0.006575184  1.000000000 0.19371484
## wage   0.072621750  0.823460990  0.193714836 1.00000000

Example 1

Plot all data all together

library(GGally)
ggpairs(dt)

Example 1: Regression model

Determinants of wages

model1 <- lm(wage ~ gender + educ + exper, data = dt)
summary(model1)
## 
## Call:
## lm(formula = wage ~ gender + educ + exper, data = dt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4503 -0.8866  0.0170  0.8475  4.2938 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.42967    0.17362 -25.513  < 2e-16 ***
## gender       4.00421    0.09210  43.476  < 2e-16 ***
## educ         0.47069    0.03268  14.401  < 2e-16 ***
## exper        0.26214    0.04788   5.475 5.91e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.284 on 776 degrees of freedom
##   (220 observations deleted due to missingness)
## Multiple R-squared:  0.7328, Adjusted R-squared:  0.7317 
## F-statistic: 709.3 on 3 and 776 DF,  p-value: < 2.2e-16
str(model1)
## List of 13
##  $ coefficients : Named num [1:4] -4.43 4.004 0.471 0.262
##   ..- attr(*, "names")= chr [1:4] "(Intercept)" "gender" "educ" "exper"
##  $ residuals    : Named num [1:780] 1.164 -0.593 -2.258 1.642 1.748 ...
##   ..- attr(*, "names")= chr [1:780] "2" "3" "5" "6" ...
##  $ effects      : Named num [1:780] -80.91 55.79 -18.67 -7.03 1.87 ...
##   ..- attr(*, "names")= chr [1:780] "(Intercept)" "gender" "educ" "exper" ...
##  $ rank         : int 4
##  $ fitted.values: Named num [1:780] 5.385 0.684 0.8 6.473 1.87 ...
##   ..- attr(*, "names")= chr [1:780] "2" "3" "5" "6" ...
##  $ assign       : int [1:4] 0 1 2 3
##  $ qr           :List of 5
##   ..$ qr   : num [1:780, 1:4] -27.9285 0.0358 0.0358 0.0358 0.0358 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:780] "2" "3" "5" "6" ...
##   .. .. ..$ : chr [1:4] "(Intercept)" "gender" "educ" "exper"
##   .. ..- attr(*, "assign")= int [1:4] 0 1 2 3
##   ..$ qraux: num [1:4] 1.04 1.04 1 1.07
##   ..$ pivot: int [1:4] 1 2 3 4
##   ..$ tol  : num 1e-07
##   ..$ rank : int 4
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 776
##  $ na.action    : 'omit' Named int [1:220] 1 4 10 20 27 28 32 39 40 42 ...
##   ..- attr(*, "names")= chr [1:220] "1" "4" "10" "20" ...
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = wage ~ gender + educ + exper, data = dt)
##  $ terms        :Classes 'terms', 'formula'  language wage ~ gender + educ + exper
##   .. ..- attr(*, "variables")= language list(wage, gender, educ, exper)
##   .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:4] "wage" "gender" "educ" "exper"
##   .. .. .. ..$ : chr [1:3] "gender" "educ" "exper"
##   .. ..- attr(*, "term.labels")= chr [1:3] "gender" "educ" "exper"
##   .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(wage, gender, educ, exper)
##   .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:4] "wage" "gender" "educ" "exper"
##  $ model        :'data.frame':   780 obs. of  4 variables:
##   ..$ wage  : num [1:780] 6.5486 0.0909 -1.4579 8.1149 3.6185 ...
##   ..$ gender: int [1:780] 2 1 1 2 1 2 1 1 2 1 ...
##   ..$ educ  : int [1:780] 3 2 3 5 5 2 5 4 3 2 ...
##   ..$ exper : num [1:780] 1.503 0.642 -0.712 2.061 -0.22 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language wage ~ gender + educ + exper
##   .. .. ..- attr(*, "variables")= language list(wage, gender, educ, exper)
##   .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:4] "wage" "gender" "educ" "exper"
##   .. .. .. .. ..$ : chr [1:3] "gender" "educ" "exper"
##   .. .. ..- attr(*, "term.labels")= chr [1:3] "gender" "educ" "exper"
##   .. .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(wage, gender, educ, exper)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:4] "wage" "gender" "educ" "exper"
##   ..- attr(*, "na.action")= 'omit' Named int [1:220] 1 4 10 20 27 28 32 39 40 42 ...
##   .. ..- attr(*, "names")= chr [1:220] "1" "4" "10" "20" ...
##  - attr(*, "class")= chr "lm"

Example 1 - Exploratory visualizations

Plots

ggplot(dt, aes(x = exper)) + geom_histogram()

ggplot(dt, aes(x = exper)) + geom_density()

ggplot(dt, aes(x = exper)) + geom_density() +
 stat_function(fun = dnorm, args = list(mean = 0, sd = 1), col="blue")

ggplot(dt, aes(x = educ)) + geom_histogram()

ggplot(dt, aes(x = exper, y = wage)) + geom_point()

ggplot(dt, aes(x = exper, y = wage)) + geom_point() + 
    geom_smooth(formula = y ~ x)

ggplot(dt, aes(x = exper, y = wage)) + geom_point() + 
    geom_smooth(formula = y ~ x, se = FALSE, method = "lm")

ggplot(dt, aes(x = exper, y = wage)) + geom_point() + facet_wrap(~ gender) + 
    geom_smooth(formula = y ~ x, se = FALSE, method = "lm")

ggplot(dt, aes(x = exper, y = wage)) + geom_point() + facet_wrap(gender ~ educ, nrow = 2) + 
    geom_smooth(formula = y ~ x, se = FALSE, method = "lm")

Example 1: Regression model

Determinants of wages

dt$educ <- as.factor(dt$educ)

model2 <- lm(wage ~ educ + exper + educ*exper, data = subset(dt, gender == 1))
summary(model2)
## 
## Call:
## lm(formula = wage ~ educ + exper + educ * exper, data = subset(dt, 
##     gender == 1))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.73114 -0.61771 -0.00812  0.62456  2.42288 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.07431    0.10991   9.774   <2e-16 ***
## educ2        0.03364    0.15597   0.216   0.8293    
## educ3       -0.28991    0.15232  -1.903   0.0577 .  
## educ4       -0.22512    0.15384  -1.463   0.1441    
## educ5        0.13026    0.15806   0.824   0.4103    
## exper        0.07463    0.11116   0.671   0.5024    
## educ2:exper -0.17990    0.14946  -1.204   0.2294    
## educ3:exper  0.14172    0.15716   0.902   0.3677    
## educ4:exper -0.02000    0.16431  -0.122   0.9032    
## educ5:exper -0.03407    0.17713  -0.192   0.8476    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9827 on 400 degrees of freedom
##   (90 observations deleted due to missingness)
## Multiple R-squared:  0.03855,    Adjusted R-squared:  0.01692 
## F-statistic: 1.782 on 9 and 400 DF,  p-value: 0.06979
model3 <- lm(wage ~ educ + exper + educ*exper, data = subset(dt, gender == 2))
summary(model3)
## 
## Call:
## lm(formula = wage ~ educ + exper + educ * exper, data = subset(dt, 
##     gender == 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.86437 -0.77670 -0.01112  0.75068  2.91145 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.00383    0.12360  24.303  < 2e-16 ***
## educ2        1.05102    0.17146   6.130 2.31e-09 ***
## educ3        2.03922    0.17726  11.504  < 2e-16 ***
## educ4        2.96237    0.17822  16.622  < 2e-16 ***
## educ5        3.87530    0.17191  22.543  < 2e-16 ***
## exper        0.05794    0.11777   0.492 0.623058    
## educ2:exper  0.23215    0.17072   1.360 0.174750    
## educ3:exper  0.61271    0.16890   3.628 0.000327 ***
## educ4:exper  0.86182    0.18209   4.733 3.19e-06 ***
## educ5:exper  1.04711    0.18331   5.712 2.34e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.055 on 360 degrees of freedom
##   (130 observations deleted due to missingness)
## Multiple R-squared:  0.6858, Adjusted R-squared:  0.6779 
## F-statistic: 87.31 on 9 and 360 DF,  p-value: < 2.2e-16

Explanatory visualization

Visualization process 1

Visualization process 2

Visualization process 3

Choose the graphical tool

Source: http://curleylab.psych.columbia.edu/netviz/netviz1.html

Visualization process 4

Design the components

Visualization process 4

Design the components

Source: https://isabella-b.com/blog/ggplot2-theme-elements-reference/

Data visualization checklist

Data visualization checklist

Source: Stephanie Evergreen and Ann K. Emery, Data Visualization Checklist, http://AnnKEmery.com/blog

Visualization examples: Case 1

Bar chart vs pie chart

Visualization examples: Case 2

Breakfast preferences: Case 2

Breakfast preferences: Case 2

Source: Stephanie Evergreen and Ann K. Emery, Data Visualization Checklist, http://AnnKEmery.com/blog

Visualization examples: Case 3

Program evaluation: Case 3

Program evaluation: Case 3

Source: Angie Ficek Data Visualization Techniques: How to Tell Your Story to Make it Stick, http://www.cehd.umn.edu/olpd/mesi/spring/2015/ficek-datavis.pdf

ggplot basics

ggplot grammer

Example 2: ggplot components

geoms

geom’s define chart type. Variables should be map to aesthetics.

scales

Scales control the mapping between data and aesthetics.

Coordinate systems

Coordinate systems adjust the mapping from coordinates to the 2d plane of the computer screen.

Faceting

Facets display subsets of the dataset in different panels.

For more information, see ggplot2 web site: http://docs.ggplot2.org/current/

ggplot: Scatter chart

# Read the data
dt <- read.csv("https://raw.githubusercontent.com/etaymaz/econ413/master/Data/003_data.csv")

# Make it data table
dt <- data.table(dt)

# Scatter chart with 2 variables
ggplot(dt, aes(x = exper, y = wage)) + geom_point()

# Scatter chart with 3 variables - continuous color
ggplot(dt, aes(x = exper, y = wage, color = -educ)) + geom_point()

# Scatter chart with 3 variables - discrete color
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ))) + geom_point()

# Scatter chart with 4 variables
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender))) + geom_point()

# Scatter chart with 4 variables and large shapes
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender))) + 
  geom_point(size=3)

# Scatter chart with 5 variables 
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender), alpha = educ)) + 
  geom_point(size=3)

# Scatter chart with 4 variables and large and transparent shapes
ggplot(dt, aes(x = exper, y = wage, color = as.factor(educ), shape = as.factor(gender))) + 
  geom_point(size=3, alpha = 0.5)

ggplot: Line chart

# Calculate mean values by gender and education
dtm <- dt[, list(mwage = mean(wage, na.rm = T), mexper = mean(exper, na.rm = T)), by = list(gender, educ)]

# Line chart with 3 variables
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line()

# Line chart with 3 variables - thick line
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line(size = 2)

# Line chart with 3 variables with points
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line() + geom_point()

# Line chart with 3 variables with points - point size by mean experience
ggplot(dtm, aes(x = educ, y = mwage, color = as.factor(gender))) + geom_line() + geom_point(aes(size = mexper))

ggplot: Bar chart

# Bar chart with 1 variable
ggplot(dt, aes(x = educ)) + geom_bar()

# Bar chart with 2 variables
ggplot(dt, aes(x = educ, weight = wage)) + geom_bar() 

# Bar chart with 3 variables
ggplot(dt, aes(x = educ, weight = wage, fill = as.factor(educ))) + geom_bar() 

# Bar chart with 3 variables
ggplot(dt, aes(x = educ, weight = wage, fill = as.factor(gender))) + geom_bar() 

Example 2

Example 2: Download the data

# Download the data
pwt <- read_stata("http://www.rug.nl/ggdc/docs/pwt90.dta")

# Make the data a data.table object
pwt <- data.table(pwt)

Example 2: Check the variables

# Class
class(pwt)
## [1] "data.table" "data.frame"
# Variable names
names(pwt)
##  [1] "countrycode"   "country"       "currency_unit" "year"         
##  [5] "rgdpe"         "rgdpo"         "pop"           "emp"          
##  [9] "avh"           "hc"            "ccon"          "cda"          
## [13] "cgdpe"         "cgdpo"         "ck"            "ctfp"         
## [17] "cwtfp"         "rgdpna"        "rconna"        "rdana"        
## [21] "rkna"          "rtfpna"        "rwtfpna"       "labsh"        
## [25] "delta"         "xr"            "pl_con"        "pl_da"        
## [29] "pl_gdpo"       "i_cig"         "i_xm"          "i_xr"         
## [33] "i_outlier"     "cor_exp"       "statcap"       "csh_c"        
## [37] "csh_i"         "csh_g"         "csh_x"         "csh_m"        
## [41] "csh_r"         "pl_c"          "pl_i"          "pl_g"         
## [45] "pl_x"          "pl_m"          "pl_k"
# Dimension
dim(pwt)
## [1] 11830    47

Example 2: Keep main variables and check the data

# Select a subset of variables
pwts <- pwt[, list(countrycode, year, rgdpo, pop, emp, avh, hc, rgdpna, rkna)]

# Structure
str(pwts)
## Classes 'data.table' and 'data.frame':   11830 obs. of  9 variables:
##  $ countrycode: chr  "ABW" "ABW" "ABW" "ABW" ...
##   ..- attr(*, "label")= chr "3-letter ISO country code"
##   ..- attr(*, "format.stata")= chr "%9s"
##  $ year       : num  1950 1951 1952 1953 1954 ...
##   ..- attr(*, "label")= chr "Year"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rgdpo      : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Output-side real GDP at chained PPPs (in mil. 2011US$)"
##   ..- attr(*, "format.stata")= chr "%14.3g"
##  $ pop        : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Population (in millions)"
##   ..- attr(*, "format.stata")= chr "%10.0g"
##  $ emp        : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Number of persons engaged (in millions)"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ avh        : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Average annual hours worked by persons engaged (source: The Conference Board)"
##   ..- attr(*, "format.stata")= chr "%10.0gc"
##  $ hc         : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Human capital index, see note hc"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ rgdpna     : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Real GDP at constant 2011 national prices (in mil. 2011US$)"
##   ..- attr(*, "format.stata")= chr "%14.3g"
##  $ rkna       : num  NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Capital stock at constant 2011 national prices (in mil. 2011US$)"
##   ..- attr(*, "format.stata")= chr "%14.3g"
##  - attr(*, ".internal.selfref")=<externalptr>
# Summary statistics
summary(pwts)
##  countrycode             year          rgdpo               pop           
##  Length:11830       Min.   :1950   Min.   :       1   Min.   :   0.0044  
##  Class :character   1st Qu.:1966   1st Qu.:    6045   1st Qu.:   1.5934  
##  Mode  :character   Median :1982   Median :   24874   Median :   5.9857  
##                     Mean   :1982   Mean   :  250855   Mean   :  30.0906  
##                     3rd Qu.:1998   3rd Qu.:  132475   3rd Qu.:  19.5332  
##                     Max.   :2014   Max.   :17135952   Max.   :1369.4357  
##                                    NA's   :2391       NA's   :2391       
##       emp               avh             hc            rgdpna        
##  Min.   :  0.001   Min.   :1363   Min.   :1.007   Min.   :       8  
##  1st Qu.:  0.942   1st Qu.:1816   1st Qu.:1.408   1st Qu.:    6277  
##  Median :  2.976   Median :1979   Median :1.917   Median :   29303  
##  Mean   : 14.219   Mean   :1996   Mean   :2.033   Mean   :  277098  
##  3rd Qu.:  8.404   3rd Qu.:2177   3rd Qu.:2.609   3rd Qu.:  167537  
##  Max.   :798.368   Max.   :3042   Max.   :3.734   Max.   :17150538  
##  NA's   :3586      NA's   :8511   NA's   :3963    NA's   :2391      
##       rkna         
##  Min.   :      31  
##  1st Qu.:   16958  
##  Median :   85975  
##  Mean   :  899559  
##  3rd Qu.:  454699  
##  Max.   :67590072  
##  NA's   :2421
# Missing observations
md.pattern(pwts)

##      countrycode year rgdpo  pop rgdpna rkna  emp   hc  avh      
## 3289           1    1     1    1      1    1    1    1    1     0
## 4006           1    1     1    1      1    1    1    1    0     1
## 30             1    1     1    1      1    1    1    0    1     1
## 919            1    1     1    1      1    1    1    0    0     2
## 572            1    1     1    1      1    1    0    1    0     2
## 593            1    1     1    1      1    1    0    0    0     3
## 30             1    1     1    1      1    0    0    0    0     4
## 2391           1    1     0    0      0    0    0    0    0     7
##                0    0  2391 2391   2391 2421 3586 3963 8511 25654

Example 2: Exploratory visualization

# Generate GDP per capita variable - 000 USD
pwts[, gdpc := (rgdpo / pop) / 1000]

# Check the data
summary(pwts)
##  countrycode             year          rgdpo               pop           
##  Length:11830       Min.   :1950   Min.   :       1   Min.   :   0.0044  
##  Class :character   1st Qu.:1966   1st Qu.:    6045   1st Qu.:   1.5934  
##  Mode  :character   Median :1982   Median :   24874   Median :   5.9857  
##                     Mean   :1982   Mean   :  250855   Mean   :  30.0906  
##                     3rd Qu.:1998   3rd Qu.:  132475   3rd Qu.:  19.5332  
##                     Max.   :2014   Max.   :17135952   Max.   :1369.4357  
##                                    NA's   :2391       NA's   :2391       
##       emp               avh             hc            rgdpna        
##  Min.   :  0.001   Min.   :1363   Min.   :1.007   Min.   :       8  
##  1st Qu.:  0.942   1st Qu.:1816   1st Qu.:1.408   1st Qu.:    6277  
##  Median :  2.976   Median :1979   Median :1.917   Median :   29303  
##  Mean   : 14.219   Mean   :1996   Mean   :2.033   Mean   :  277098  
##  3rd Qu.:  8.404   3rd Qu.:2177   3rd Qu.:2.609   3rd Qu.:  167537  
##  Max.   :798.368   Max.   :3042   Max.   :3.734   Max.   :17150538  
##  NA's   :3586      NA's   :8511   NA's   :3963    NA's   :2391      
##       rkna               gdpc         
##  Min.   :      31   Min.   :   0.133  
##  1st Qu.:   16958   1st Qu.:   2.019  
##  Median :   85975   Median :   5.410  
##  Mean   :  899559   Mean   :  19.478  
##  3rd Qu.:  454699   3rd Qu.:  13.703  
##  Max.   :67590072   Max.   :4447.840  
##  NA's   :2421       NA's   :2391
# Find the outlier
pwts[gdpc > 1000]
##     countrycode year     rgdpo      pop       emp avh hc   rgdpna      rkna
##  1:         BMU 1970 118924.79 0.052286        NA  NA NA 1634.187  5616.710
##  2:         BMU 1971 214744.45 0.052857        NA  NA NA 1690.362  5655.335
##  3:         BMU 1972 168675.36 0.053426        NA  NA NA 1721.003  5728.377
##  4:         BMU 1973 163793.48 0.053980        NA  NA NA 1746.537  5789.289
##  5:         BMU 1974 148884.08 0.054507        NA  NA NA 1761.858  5849.582
##  6:         BMU 1975 158232.27 0.054994        NA  NA NA 1828.246  5927.734
##  7:         BMU 1976 194509.84 0.055439        NA  NA NA 1991.665  5968.023
##  8:         BMU 1977 182189.16 0.055851        NA  NA NA 2042.242  6053.705
##  9:         BMU 1978 179753.22 0.056237        NA  NA NA 2083.601  6163.846
## 10:         BMU 1979 180668.77 0.056613        NA  NA NA 2220.414  6307.740
## 11:         BMU 1980 154140.38 0.056990        NA  NA NA 2304.830  6537.652
## 12:         BMU 1981 150503.89 0.057370        NA  NA NA 2223.688  6706.583
## 13:         BMU 1982 141148.67 0.057752        NA  NA NA 2225.750  6959.545
## 14:         BMU 1983 146613.58 0.058138        NA  NA NA 2245.035  7217.527
## 15:         BMU 1984 155851.80 0.058528        NA  NA NA 2199.310  7533.817
## 16:         BMU 1985 119540.04 0.058923        NA  NA NA 2292.701  7784.550
## 17:         BMU 1986 169751.81 0.059324 0.0334450  NA NA 2394.340  7934.764
## 18:         BMU 1987 265669.47 0.059730 0.0352440  NA NA 2489.551  8121.300
## 19:         BMU 1988 182990.47 0.060138 0.0364200  NA NA 2522.420  8423.078
## 20:         BMU 1989  96493.66 0.060539 0.0362370  NA NA 2525.573  8607.367
## 21:         BMU 1990 168599.41 0.060931 0.0355730  NA NA 2458.744  8666.035
## 22:         BMU 1991 179122.67 0.061311 0.0346210  NA NA 2465.172  8659.007
## 23:         BMU 1992 146723.64 0.061680 0.0336500  NA NA 2397.979  8664.764
## 24:         BMU 1993 156847.22 0.062036 0.0334270  NA NA 2456.076  8685.491
## 25:         BMU 1995 135433.84 0.062694 0.0341330  NA NA 2629.517  8728.345
## 26:         BMU 1996 113023.91 0.062989 0.0346330  NA NA 2731.399  8880.891
## 27:         BMU 1997 183042.34 0.063257 0.0352960  NA NA 2857.200  9099.609
## 28:         BMU 2000  80290.86 0.064035 0.0374475  NA NA 3161.136 10313.338
## 29:         BMU 2002 134871.80 0.064614 0.0378150  NA NA 3339.322 10428.872
##     countrycode year     rgdpo      pop       emp avh hc   rgdpna      rkna
##         gdpc
##  1: 2274.505
##  2: 4062.744
##  3: 3157.177
##  4: 3034.337
##  5: 2731.467
##  6: 2877.264
##  7: 3508.538
##  8: 3262.057
##  9: 3196.351
## 10: 3191.295
## 11: 2704.692
## 12: 2623.390
## 13: 2444.048
## 14: 2521.820
## 15: 2662.859
## 16: 2028.750
## 17: 2861.436
## 18: 4447.840
## 19: 3042.843
## 20: 1593.909
## 21: 2767.055
## 22: 2921.542
## 23: 2378.788
## 24: 2528.326
## 25: 2160.236
## 26: 1794.344
## 27: 2893.630
## 28: 1253.859
## 29: 2087.346
##         gdpc
# Drop the outlier
pwts <- pwts[!countrycode == "BMU"]

# Plot the GDP per capita data for all countries
# Define labels
clabs <-   labs(title = "GDP per capita",
                subtitle = "All countries",
                x = "", y = "Real GDP, PPP, 2011 prices",
                caption = "Source: Penn World Tables 9.0")

ggplot(data = pwts, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + 
  clabs

# Remove legends
ggplot(data = pwts, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + 
  theme(legend.position="none") +
  clabs

# Makegdpc logarithmic
ggplot(data = pwts, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + 
  theme(legend.position="none") +
  scale_y_log10() +
  clabs

# Plot for a set of countries
clist <- c("TUR", "USA", "IND", "CHN", "KOR", "ESP")
pwtc <- subset(pwts, countrycode %in% clist)

ggplot(data = pwtc, mapping = aes(x = year, y = gdpc)) +
  geom_line() + 
  scale_y_log10() +
  facet_wrap(~ countrycode, nrow = 3, ncol = 3) +
  clabs

ggplot(data = pwtc, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + 
  scale_y_log10() +
  clabs

Example 2: Scatter plot

# Labels
clabs <- labs(title = "GDP-human capital relation",
              subtitle = "All countries",
              x = "Human capital index", y = "Real GDP, PPP, 2011 prices",
              caption = "Source: Penn World Tables 9.0")

ggplot(data = pwts, mapping = aes(x = hc, y = gdpc, color = -year)) +
  geom_point() + 
  scale_y_log10() +
  clabs

# GDP per capita and education - selected years
ylist <- seq(1950, 2010, by = 10)

ggplot(data = subset(pwts, year %in% ylist), mapping = aes(x = hc, y = gdpc)) +
  geom_point() + 
  facet_wrap(~ year) +
  scale_y_log10() +
  clabs

# Add trend lines
ggplot(data = subset(pwts, year %in% ylist), mapping = aes(x = hc, y = gdpc)) +
  geom_point() + 
  geom_smooth(method = "lm", formula = y ~ x, , se = FALSE) + 
  facet_wrap(~ year) +
  scale_y_log10() +
  clabs

Example 2: Themes

# Define labels
clabs <-   labs(title = "GDP per capita",
                subtitle = "Selected countries",
                x = "", y = "Real GDP, PPP, 2011 prices",
                caption = "Source: Penn World Tables 9.0")

# Default plot
p1 <- ggplot(data = pwtc, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + 
  scale_y_log10() +
  clabs

# Minimal theme
p1 + theme_minimal()

# BW theme
p1 + theme_bw()

# Classic theme
p1 + theme_classic()

# Economist theme
p1 + theme_economist()

# WSJ theme
p1 + theme_wsj()

Example 2: Changing themes

Example 2: Changing themes

# Default plot
p1 <- ggplot(data = pwtc, mapping = aes(x = year, y = gdpc, color = countrycode)) +
  geom_line() + theme_classic() + clabs
p1

# Add points
p1 + geom_point()

# Add a trend line
p1 + geom_smooth(method = "lm", formula = y ~ x, se = FALSE)

# Add a quadratic trend line
p1 + geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE)

# Make line thicker
p1 <- p1 + geom_line(size = 1.5)
p1

# Change legend title
p1 <- p1 + scale_color_discrete(name = "Countries") 
p1

# Make title larger
p1 <- p1 +  theme(plot.title = element_text(size = rel(3)))
p1

# Make subtitle larger
p1 <- p1 +  theme(plot.subtitle = element_text(size = rel(2)))
p1

# Make caption left justified and larger
p1 <- p1 +  theme(plot.caption = element_text(hjust = 0, size = rel(1.5)))
p1

# Make caption gray
p1 <- p1 +  theme(plot.caption = element_text(color = "gray40"))
p1

# Add vertical line for crises years
p1 <- p1 +  geom_vline(xintercept = c(1980, 1994, 2001, 2009), size = 4, alpha = 0.25) 
p1

# Change legend position
p1 <- p1 +  theme(legend.position = c(0.15, 0.75))
p1

# Add text
p1 + annotate("text", x=1980, y=50, label="First crisis") 

# Add formula
p1 + annotate("text", x=1965, y=45, parse=TRUE, label="frac(1, sqrt(2 * pi)) * e ^ {-x^2 / 2}")