Econometrics Lab 10
setwd("~/Desktop/R WD")
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(data.table)
load("datasets/dt_wages.RData")
Part 2
Descriptive Statistics
dt.wages <- data.table(dt.wages)
stargazer(dt.wages, type = "text")
##
## ==============================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## --------------------------------------------------------------
## wage 526 5.896 3.693 0.530 3.330 6.880 24.980
## educ 526 12.563 2.769 0 12 14 18
## exper 526 17.017 13.572 1 5 26 51
## tenure 526 5.105 7.224 0 0 7 44
## nonwhite 526 0.103 0.304 0 0 0 1
## female 526 0.479 0.500 0 0 1 1
## married 526 0.608 0.489 0 0 1 1
## numdep 526 1.044 1.262 0 0 2 6
## smsa 526 0.722 0.448 0 0 1 1
## northcen 526 0.251 0.434 0 0 0.8 1
## south 526 0.356 0.479 0 0 1 1
## west 526 0.169 0.375 0 0 0 1
## construc 526 0.046 0.209 0 0 0 1
## ndurman 526 0.114 0.318 0 0 0 1
## trcommpu 526 0.044 0.205 0 0 0 1
## trade 526 0.287 0.453 0 0 1 1
## services 526 0.101 0.301 0 0 0 1
## profserv 526 0.259 0.438 0 0 1 1
## profocc 526 0.367 0.482 0 0 1 1
## clerocc 526 0.167 0.374 0 0 0 1
## servocc 526 0.141 0.348 0 0 0 1
## lwage 526 1.623 0.532 -0.635 1.203 1.929 3.218
## expersq 526 473.435 616.045 1 25 676 2,601
## tenursq 526 78.150 199.435 0 0 49 1,936
## --------------------------------------------------------------
ncol(dt.wages)
## [1] 24
There are 526 observations of 24 variables
Tables
table(dt.wages[, list(female, nonwhite)])
## nonwhite
## female 0 1
## 0 245 29
## 1 227 25
table(dt.wages[, list(female, nonwhite, south)])
## , , south = 0
##
## nonwhite
## female 0 1
## 0 158 13
## 1 154 14
##
## , , south = 1
##
## nonwhite
## female 0 1
## 0 87 16
## 1 73 11
qplot(factor(educ), data=dt.wages, geom = "bar")
qplot(wage, data=dt.wages, geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
qplot(wage, data=dt.wages, geom = "histogram", binwidth=2)
Continuous vs Categorical Data
qplot(x=factor("All"), y=wage, data=dt.wages, geom = "boxplot")
qplot(x=factor(female), y=wage, data=dt.wages, geom = "boxplot")
qplot (factor(female),wage, fill=factor(nonwhite), data=dt.wages, geom="boxplot")
qplot (factor(nonwhite),wage, fill=factor(female), data=dt.wages, geom="boxplot")
ggplot(dt.wages) + geom_boxplot(aes(factor(educ), exper))
qplot(educ, wage, data=dt.wages, geom = "point")
qplot(educ, wage, color=factor(female), data=dt.wages, geom = "point")
ggplot(dt.wages) + geom_point(aes(educ, wage)) + facet_grid( ~ female)
ggplot(dt.wages) + geom_point(aes(educ, wage)) + facet_grid(nonwhite ~ female)
ggpairs(dt.wages[, list(wage, educ, exper)])
ggpairs(dt.wages[, list(female=factor(female), nonwhite=factor(nonwhite))])
ggpairs(dt.wages[, list(wage, educ, exper, female=factor(female), nonwhite=factor(nonwhite))])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Outliers
dt.wages[wage>20,][order(-wage)]
## wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1: 24.98 18 29 25 0 0 1 0 1 0 0
## 2: 22.86 16 16 7 0 0 1 2 1 0 0
## 3: 22.20 12 31 15 0 0 1 1 1 0 0
## 4: 21.86 12 24 16 0 0 1 3 1 1 0
## 5: 21.63 18 8 8 0 1 0 0 1 0 0
## west construc ndurman trcommpu trade services profserv profocc clerocc
## 1: 0 0 0 0 0 0 0 1 0
## 2: 0 0 0 0 0 0 0 1 0
## 3: 1 0 0 0 0 0 0 1 0
## 4: 0 0 0 0 1 0 0 1 0
## 5: 0 0 0 0 0 0 1 1 0
## servocc lwage expersq tenursq
## 1: 0 3.218076 841 625
## 2: 0 3.129389 256 49
## 3: 0 3.100092 961 225
## 4: 0 3.084659 576 256
## 5: 0 3.074081 64 64
qplot(lwage, data=dt.wages, geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
dt.wages[lwage<0,]
## wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1: 0.53 12 3 1 0 1 0 0 1 0 0
## west construc ndurman trcommpu trade services profserv profocc clerocc
## 1: 1 0 0 0 0 1 0 0 0
## servocc lwage expersq tenursq
## 1: 1 -0.6348783 9 1
Part 3
Difference in means estimator when treatment is ‘south’
Mean wage of treated (south=1) - mean wage of untreated (south=0) with second term as approximation for counterfactual.
dt.wages[south==1, mean(wage)] - dt.wages[south==0, mean(wage)]
## [1] -0.7900927
Regression of treatment effects with race and gender as controls
If we simply regress wage on south, there is a problem of omitted variables. Clearly other factors effect wage other than whether an individual is from the south or not. Omitting variables that determine both treatment status and the outcome variable generates a correlation between the treatment variable and the error term. Therefore we should include observed covariates.
Here, we simply add race and gender variables to a regression of outcome variable (wage) on treatment variable (south).
library(matlib)
treat.reg <- lm(wage ~ south + nonwhite + female, data=dt.wages)
stargazer(treat.reg, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## wage
## -----------------------------------------------
## south -0.884***
## (0.317)
##
## nonwhite -0.372
## (0.499)
##
## female -2.552***
## (0.302)
##
## Constant 7.471***
## (0.243)
##
## -----------------------------------------------
## Observations 526
## R2 0.130
## Adjusted R2 0.125
## Residual Std. Error 3.454 (df = 522)
## F Statistic 26.104*** (df = 3; 522)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Heterogeneous treatment effects (Regression adjustment)
So far we have assumed that the treatment effects are homogenous. That is, for each individual that receives treatment, the effect of the treatment on the outcome variable is the same. In this case the difference between ATE and ATET is trivial.
It may be reasonable to assume that the treatment effects is in fact heterogeneous. In order to do so we can add interaction terms between the covariates and the treatment dummy.
RA <- lm(wage ~ south + nonwhite + female + nonwhite*south + female*south, data=dt.wages)
stargazer(RA, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## wage
## -----------------------------------------------
## south -1.288***
## (0.447)
##
## nonwhite 0.155
## (0.691)
##
## female -2.953***
## (0.374)
##
## south:nonwhite -1.047
## (0.996)
##
## south:female 1.117*
## (0.630)
##
## Constant 7.628***
## (0.269)
##
## -----------------------------------------------
## Observations 526
## R2 0.138
## Adjusted R2 0.129
## Residual Std. Error 3.446 (df = 520)
## F Statistic 16.591*** (df = 5; 520)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
south.nonwhite <- dt.wages[south==1, mean(nonwhite)] #mean wage of nonwhites in south
south.female <- dt.wages[south==1, mean(female)] #mean wage of females in south
ATE estimate
where
is the regression coefficient on the treatment dummy and the second part is the expected value of the covariate multiplied by the coefficient on the corresponding interaction variable.
summary(RA)$coefficients[2,1] + (dt.wages[, mean(nonwhite)]*summary(RA)$coefficients[5,1]) + (dt.wages[, mean(female)]*summary(RA)$coefficients[6,1])
## [1] -0.8600141
ATET estimate
where the second part is the expected value of the covariate, given the individual was treated, multiplied by the coefficient on the corresponding interaction variable.
summary(RA)$coefficients[2,1] + (south.nonwhite*summary(RA)$coefficients[5,1]) + (south.female*summary(RA)$coefficients[6,1])
## [1] -0.9370693
Two-step fitted regression
First carry out seperate regressions on the treatment group and the non-treatment group.
(The first two lines of code create subsets from the data table of treated (south.1) and untreated (south.0) groups)
NB. we don’t need to add the treatment dummy to the regressions
south.1 <- subset(dt.wages, south==1)
south.0 <- subset(dt.wages, south==0)
twostep.1 <- lm(wage ~ nonwhite + female, data=south.1)
twostep.0 <- lm(wage ~ nonwhite + female, data=south.0)
stargazer(twostep.1, twostep.0, type = "text")
##
## ==================================================================
## Dependent variable:
## ----------------------------------------------
## wage
## (1) (2)
## ------------------------------------------------------------------
## nonwhite -0.892 0.155
## (0.617) (0.739)
##
## female -1.836*** -2.953***
## (0.436) (0.400)
##
## Constant 6.341*** 7.628***
## (0.307) (0.287)
##
## ------------------------------------------------------------------
## Observations 187 339
## R2 0.096 0.139
## Adjusted R2 0.086 0.134
## Residual Std. Error 2.963 (df = 184) 3.684 (df = 336)
## F Statistic 9.724*** (df = 2; 184) 27.234*** (df = 2; 336)
## ==================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Next, using the 2 models, predict wages for our full sample
library(Metrics)
dt.wages$y_hat1 <- predict(twostep.1, newdata=dt.wages)
dt.wages$y_hat0 <- predict(twostep.0, newdata=dt.wages)
ATE estimate
mean(dt.wages$y_hat1 - dt.wages$y_hat0)
## [1] -0.8600141
ATET estimate
mean(dt.wages$south*(dt.wages$y_hat1 - dt.wages$y_hat0))/mean(dt.wages$south)
## [1] -0.9370693
Regression with experience added
exp.reg <- lm(wage ~ south + nonwhite + female + exper, data=dt.wages)
Heterogenous treatment effects using two step fitted regression
exptwostep.1 <- lm(wage ~ nonwhite + female + exper, data=south.1)
exptwostep.0 <- lm(wage ~ nonwhite + female + exper, data=south.0)
dt.wages$y_hat2 <- predict(exptwostep.1, newdata=dt.wages)
dt.wages$y_hat3 <- predict(exptwostep.0, newdata=dt.wages)
ATE estimate
mean(dt.wages$y_hat2 - dt.wages$y_hat3)
## [1] -0.8723571
ATET estimate
mean(dt.wages$south*(dt.wages$y_hat2 - dt.wages$y_hat3))/mean(dt.wages$south)
## [1] -1.006724