Linear regression

Create a data

set.seed(123)
dat <- data.frame(x1 = rnorm(100), x2 = rnorm(100), e = rnorm(100))

# Make some random observations
dat$x1[c(1, 2)] <- NA
dat$x2[c(3, 4)] <- NA

# Make an outlier
dat$e[5] <- 15
dat$y <- dat$x1 + 2*dat$x2 + dat$e

Descriptive statistics

str(dat)

## 'data.frame':    100 obs. of  4 variables:
##  $ x1: num  NA NA 1.5587 0.0705 0.1293 ...
##  $ x2: num  -0.71 0.257 NA NA -0.952 ...
##  $ e : num  2.199 1.312 -0.265 0.543 15 ...
##  $ y : num  NA NA NA NA 13.2 ...

summary(dat)

##        x1                x2                e                  y           
##  Min.   :-2.3092   Min.   :-2.0532   Min.   :-1.75653   Min.   :-5.19556  
##  1st Qu.:-0.4865   1st Qu.:-0.8335   1st Qu.:-0.53131   1st Qu.:-1.60513  
##  Median : 0.0906   Median :-0.2063   Median : 0.05345   Median : 0.01854  
##  Mean   : 0.1003   Mean   :-0.1037   Mean   : 0.27461   Mean   : 0.12979  
##  3rd Qu.: 0.6982   3rd Qu.: 0.5005   3rd Qu.: 0.84617   3rd Qu.: 1.56870  
##  Max.   : 2.1873   Max.   : 3.2410   Max.   :15.00000   Max.   :13.22605  
##  NA's   :2         NA's   :2                            NA's   :4

# Density charts
ggplot(dat, aes(x=y)) + geom_density()

# Scatter charts
ggplot(dat, aes(x=y, y=x1)) + geom_point()

ggplot(dat, aes(x=y, y=x2)) + geom_point()

# Make the oulier missing
dat$y[dat$y > 10] <- NA

# Check it
ggplot(dat, aes(x=y, y=x1)) + geom_point()

ggplot(dat, aes(x=y, y=x2)) + geom_point()

# Missing values
library(mice)
md.pattern(dat)

##    e x1 x2 y  
## 95 1  1  1 1 0
## 1  1  1  1 0 1
## 2  1  1  0 0 2
## 2  1  0  1 0 2
##    0  2  2 5 9

Linear regression

Use lm function (stats package)
- lm(formula, data, subset, weights, na.action, method = “qr”, model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, contrasts = NULL, offset, …)

model_1 <- lm(y ~ x1 + x2, data = dat)

class(model_1)

## [1] "lm"

str(model_1)

## List of 13
##  $ coefficients : Named num [1:3] 0.103 0.892 2.03
##   ..- attr(*, "names")= chr [1:3] "(Intercept)" "x1" "x2"
##  $ residuals    : Named num [1:95] -0.392 -0.818 -0.784 1.485 -0.233 ...
##   ..- attr(*, "names")= chr [1:95] "6" "7" "8" "9" ...
##  $ effects      : Named num [1:95] 0.0786 -6.9742 19.3705 1.4744 -0.3262 ...
##   ..- attr(*, "names")= chr [1:95] "(Intercept)" "x1" "x2" "" ...
##  $ rank         : int 3
##  $ fitted.values: Named num [1:95] 1.54 -1.08 -4.41 -1.28 1.57 ...
##   ..- attr(*, "names")= chr [1:95] "6" "7" "8" "9" ...
##  $ assign       : int [1:3] 0 1 2
##  $ qr           :List of 5
##   ..$ qr   : num [1:95, 1:3] -9.747 0.103 0.103 0.103 0.103 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:95] "6" "7" "8" "9" ...
##   .. .. ..$ : chr [1:3] "(Intercept)" "x1" "x2"
##   .. ..- attr(*, "assign")= int [1:3] 0 1 2
##   ..$ qraux: num [1:3] 1.1 1.03 1.19
##   ..$ pivot: int [1:3] 1 2 3
##   ..$ tol  : num 1e-07
##   ..$ rank : int 3
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 92
##  $ na.action    : 'omit' Named int [1:5] 1 2 3 4 5
##   ..- attr(*, "names")= chr [1:5] "1" "2" "3" "4" ...
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = y ~ x1 + x2, data = dat)
##  $ terms        :Classes 'terms', 'formula'  language y ~ x1 + x2
##   .. ..- attr(*, "variables")= language list(y, x1, x2)
##   .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:3] "y" "x1" "x2"
##   .. .. .. ..$ : chr [1:2] "x1" "x2"
##   .. ..- attr(*, "term.labels")= chr [1:2] "x1" "x2"
##   .. ..- attr(*, "order")= int [1:2] 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(y, x1, x2)
##   .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:3] "y" "x1" "x2"
##  $ model        :'data.frame':   95 obs. of  3 variables:
##   ..$ y : num [1:95] 1.149 -1.897 -5.196 0.204 1.338 ...
##   ..$ x1: num [1:95] 1.715 0.461 -1.265 -0.687 -0.446 ...
##   ..$ x2: num [1:95] -0.045 -0.785 -1.668 -0.38 0.919 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language y ~ x1 + x2
##   .. .. ..- attr(*, "variables")= language list(y, x1, x2)
##   .. .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:3] "y" "x1" "x2"
##   .. .. .. .. ..$ : chr [1:2] "x1" "x2"
##   .. .. ..- attr(*, "term.labels")= chr [1:2] "x1" "x2"
##   .. .. ..- attr(*, "order")= int [1:2] 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(y, x1, x2)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:3] "y" "x1" "x2"
##   ..- attr(*, "na.action")= 'omit' Named int [1:5] 1 2 3 4 5
##   .. ..- attr(*, "names")= chr [1:5] "1" "2" "3" "4" ...
##  - attr(*, "class")= chr "lm"

# Coefficients
model_1$coefficients

## (Intercept)          x1          x2 
##   0.1032738   0.8917218   2.0303056

# Residuals and fitted values
resid_1 <- model_1$residuals
fitted_1 <- model_1$fitted.values
res_1 <- data.frame(resid = resid_1, fit = fitted_1)

# Residual distribution
ggplot(res_1, aes(x=resid)) + geom_density()

ggplot(res_1, aes(x=resid)) + geom_density() +
  stat_function(fun=dnorm, args(list(mean=1, sd=0)), color="red")

ggplot(res_1, aes(x=fit, y =resid)) + geom_point(size=2)

# Model summary
summary(model_1)

## 
## Call:
## lm(formula = y ~ x1 + x2, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8394 -0.6558 -0.1030  0.6220  2.0864 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.10327    0.09757   1.058    0.293    
## x1           0.89172    0.10572   8.435 4.46e-13 ***
## x2           2.03031    0.09886  20.537  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9432 on 92 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.8382, Adjusted R-squared:  0.8346 
## F-statistic: 238.2 on 2 and 92 DF,  p-value: < 2.2e-16

sum_1 <- summary(model_1)
class(sum_1)

## [1] "summary.lm"

Linear regression

# Many models
model_2 <- lm(y ~ x1, data = dat)
model_3 <- lm(y ~ x2, data = dat)
model_4 <- lm(y ~ x1 + x2 - 1, data = dat)

stargazer(model_1, model_2, model_3, model_4, type="html")


	Dependent variable:

	y
	(1)	(2)	(3)	(4)

x1	0.892^***	0.781^***		0.901^***
	(0.106)	(0.248)		(0.105)

x2	2.030^***		1.988^***	2.021^***
	(0.099)		(0.131)	(0.099)

Constant	0.103	-0.074	0.175
	(0.098)	(0.228)	(0.129)


Observations	95	95	95	95
R²	0.838	0.096	0.713	0.836
Adjusted R²	0.835	0.086	0.710	0.833
Residual Std. Error	0.943 (df = 92)	2.217 (df = 93)	1.249 (df = 93)	0.944 (df = 93)
F Statistic	238.231^*** (df = 2; 92)	9.897^*** (df = 1; 93)	231.047^*** (df = 1; 93)	237.367^*** (df = 2; 93)

Note:	p<0.1; p<0.05; p<0.01

Linear regression

Factor variables

set.seed(123)
dat <- data.frame(x1 = rnorm(90), x2 = rep(c("A", "B", "C"), each = 30), e=rnorm(90))
dat$y <- ifelse(dat$x2 == "A", 0, ifelse(dat$x2 == "B", 1, 2)) + dat$x1 + dat$e

model_f <- lm(y ~ x1 + x2, data = dat)

summary(model_f)

## 
## Call:
## lm(formula = y ~ x1 + x2, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8829 -0.6312 -0.1114  0.5872  3.0967 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.09328    0.18456  -0.505 0.614546    
## x1           1.01282    0.12070   8.391 8.53e-13 ***
## x2B          0.90742    0.26230   3.459 0.000845 ***
## x2C          2.24669    0.26103   8.607 3.11e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.01 on 86 degrees of freedom
## Multiple R-squared:  0.6347, Adjusted R-squared:  0.622 
## F-statistic: 49.81 on 3 and 86 DF,  p-value: < 2.2e-16

Panel data

fixest package

library(fixest)

set.seed(123)
dat <- data.table(firmid = rep(LETTERS[1:10], each = 10),
                  fe = rep(c(1:10), each = 10))
dat[, x := fe + rep(c(1:10)/2, times = 10) + rnorm(100)]
dat[, y := 5*fe - 2 * x + rnorm(100)]

m1 <- lm(y ~ x, data = dat)
summary(m1)

## 
## Call:
## lm(formula = y ~ x, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.7875  -5.7866  -0.6283   6.3347  18.6754 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.9569     1.9887  -1.487     0.14    
## x             1.6388     0.2207   7.427 4.15e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.534 on 98 degrees of freedom
## Multiple R-squared:  0.3601, Adjusted R-squared:  0.3536 
## F-statistic: 55.16 on 1 and 98 DF,  p-value: 4.151e-11

ggplot(dat,aes(x, y)) + geom_point() +
    geom_smooth(method = "lm", formula = "y ~ x", se = FALSE)

ggplot(dat,aes(x, y, color = firmid)) + geom_point() + 
  geom_smooth(method = "lm", formula = "y ~ x", se = FALSE)

m2 <- lm(y ~ x + firmid, data = dat)
summary(m2)

## 
## Call:
## lm(formula = y ~ x + firmid, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.07865 -0.65881 -0.07959  0.46494  2.88741 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.73192    0.38192   12.39   <2e-16 ***
## x           -2.03341    0.05757  -35.32   <2e-16 ***
## firmidB      5.11082    0.44613   11.46   <2e-16 ***
## firmidC     10.33555    0.44970   22.98   <2e-16 ***
## firmidD     15.53807    0.47929   32.42   <2e-16 ***
## firmidE     20.05265    0.49559   40.46   <2e-16 ***
## firmidF     25.69516    0.53157   48.34   <2e-16 ***
## firmidG     30.88835    0.56215   54.95   <2e-16 ***
## firmidH     35.65842    0.58094   61.38   <2e-16 ***
## firmidI     40.67113    0.64784   62.78   <2e-16 ***
## firmidJ     45.44143    0.69661   65.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9868 on 89 degrees of freedom
## Multiple R-squared:   0.99,  Adjusted R-squared:  0.9889 
## F-statistic: 883.7 on 10 and 89 DF,  p-value: < 2.2e-16

m3 <- feols(y ~ x | firmid, data = dat)
summary(m3)

## OLS estimation, Dep. Var.: y
## Observations: 100 
## Fixed-effects: firmid: 10
## Standard-errors: Clustered (firmid) 
##   Estimate Std. Error  t value   Pr(>|t|)    
## x -2.03341   0.034524 -58.8984 5.9065e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.930974     Adj. R2: 0.988909
##                  Within R2: 0.933415

ECON 413 Regression Analysis

Linear regression

Linear regression

Linear regression

Linear regression

Linear regression

Panel data

ECON 413
Regression Analysis