## Setup

Set the working directory and load the necessary packages.

setwd("~/Desktop/R WD")

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(data.table)
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(sandwich)

### Violation of Simple Linear Regression Assumption 2

SLR.2: We have a random sample of size n, {(x_i, y_i): i = 1, ..., n}, following the population model.

Create data

set.seed(1984)
x1 <- rnorm(n = 10000, mean = 0 , sd = 3) # create indep. var. 1
x2 <- rnorm(n = 10000, mean = 0, sd = 4) # create indep. var. 2
e <- rnorm(n = 10000, mean = 0, sd = 2) # create error
y <- 2 + 3*x1 + 4*x2 + e # create y according to population model

Define the data table

dt.population <- data.table(y, x1, x2, e) # create a data table from the simulated vectors
dt.population <- dt.population[order(y)] # sort rows by y (this matters for the non-random sample below)
dt.population # shows first and last rows of the table

##                y         x1         x2          e
##     1: -65.51931 -8.9425248 -10.925706  3.0110909
##     2: -64.08313  0.3982382 -16.079049 -2.9616476
##     3: -62.40373 -7.5047649 -10.113220 -1.4365531
##     4: -58.28314 -3.0228161 -12.841153  0.1499192
##     5: -57.83996 -6.7402440  -9.934788  0.1199243
##    ---                                           
##  9996:  63.23389  5.1128133  10.935504  2.1534348
##  9997:  63.99458  7.7187598  10.025892 -1.2652619
##  9998:  67.18546  6.1780560  11.427169  0.9426169
##  9999:  67.34564  5.1032742  11.854979  2.6158987
## 10000:  67.38101  3.9659545  13.253237  0.4701935

Extract a random sample

r.sample.rows <- sample(1:nrow(dt.population), size = 100)
r.sample.rows # shows the vector of 100 randomly selected row numbers

##   [1] 9312 6165 6458 6477 5188  406 2820  572  652 5549 9072 3968 2815 7752 4184
##  [16] 9956 3045 6106 1921 3925 6653 9426 9697 8545 7052 2314 8746 5711 4654 7588
##  [31] 3315 4286 2542 4340 4799  186 2661 4487 5296 2770 4991 8833 5568 4317 7128
##  [46] 3859 5196 1072 6460 3063 8471 2182 7634  727 8534 8819  597 4368 4776 4845
##  [61] 4483 9155 3458 4571 7679 4408 2644 4230 7691 1020 4918 4983 6298 5278 3204
##  [76]  976  758 8705 2609 4244 6084 5799 5596 9210 8573 6431 7388  388 6286 7904
##  [91] 2573 3262 6379 3460 8385 9613 8025 7927 4496 2157

r.sample <- dt.population[r.sample.rows,] # select the rows according to random sample
head(r.sample) # show the first rows of the sample (data.table renumbers the displayed rows)

##             y         x1        x2          e
## 1:  29.432379 -2.4933338  8.455335  1.0910400
## 2:   7.430384 -1.7219022  3.029325 -1.5212113
## 3:   8.966529  0.9625825  1.160981 -0.5651421
## 4:   9.036588 -1.4417106  2.091823  2.9944287
## 5:   3.144875  1.9221459 -0.930215 -0.9007030
## 6: -30.386939 -0.6415410 -7.397989 -0.8703582

tail(r.sample)

##              y         x1         x2         e
## 1:  19.9215164  3.4279127  1.1952087  2.856943
## 2:  34.9533953  0.6946097  6.9837498  2.934567
## 3:  17.5075346 -3.5215802  5.9829736  2.140381
## 4:  16.9222357  4.2822281  0.7916681 -1.091121
## 5:   0.1280741 -0.9962701  0.3329119 -0.214763
## 6: -11.9797443 -0.4942824 -2.7167980 -1.629705

Run regression on random sample

summary(lm( y ~ x1 + x2, data = r.sample))

## 
## Call:
## lm(formula = y ~ x1 + x2, data = r.sample)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1047 -1.4439 -0.0498  1.1808  5.9282 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.90817    0.20551   9.285 4.69e-15 ***
## x1           3.00848    0.07517  40.025  < 2e-16 ***
## x2           4.09883    0.05554  73.796  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.049 on 97 degrees of freedom
## Multiple R-squared:  0.9841, Adjusted R-squared:  0.9838 
## F-statistic:  3008 on 2 and 97 DF,  p-value: < 2.2e-16
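
The estimates (1.91, 3.01, 4.10) are close to the true population parameters (2, 3, 4): with a random sample, OLS recovers the population model well.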

Extract a non-random sample. Because dt.population was sorted by y above, the first 100 rows are the observations with the smallest outcomes, i.e., a sample selected on the dependent variable.

nr.sample <- dt.population[1:100,] # select a subset of the first 100 rows
head(nr.sample)

##            y         x1         x2          e
## 1: -65.51931 -8.9425248 -10.925706  3.0110909
## 2: -64.08313  0.3982382 -16.079049 -2.9616476
## 3: -62.40373 -7.5047649 -10.113220 -1.4365531
## 4: -58.28314 -3.0228161 -12.841153  0.1499192
## 5: -57.83996 -6.7402440  -9.934788  0.1199243
## 6: -57.68156 -5.4035347 -10.977426  0.4387483

Run regression on non-random sample

summary(lm( y ~ x1 + x2, data = nr.sample))

## 
## Call:
## lm(formula = y ~ x1 + x2, data = nr.sample)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4988 -1.2392 -0.0186  1.2096  3.9580 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5.7973     1.5566  -3.724 0.000329 ***
## x1            2.5723     0.1078  23.868  < 2e-16 ***
## x2            3.4089     0.1294  26.352  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.744 on 97 degrees of freedom
## Multiple R-squared:  0.8814, Adjusted R-squared:  0.8789 
## F-statistic: 360.3 on 2 and 97 DF,  p-value: < 2.2e-16
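
All three estimates are now well off the true values (2, 3, 4): selecting the 100 smallest outcomes violates SLR.2, and OLS no longer recovers the population model. As a check that the random-sampling version is right on average, the experiment can be repeated many times. A minimal sketch, not part of the original script (the seed and the number of repetitions are arbitrary choices):

set.seed(42) # arbitrary seed for this illustration
b1.draws <- replicate(1000, {
  rows <- sample(1:nrow(dt.population), size = 100) # fresh random sample each time
  coef(lm(y ~ x1 + x2, data = dt.population[rows, ]))["x1"]
})
mean(b1.draws) # should be close to the true beta1 = 3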

### Violation of Simple Linear Regression Assumption 3

SLR.3: The sample outcomes x_i, i = 1, ..., n, are not all the same value.

Collinearity: the variable x2 is a constant, so it has no sample variation.

set.seed(1984)
x1 <- rnorm(n = 1000, mean = 0 , sd = 2) # create indep. var. 1
x2 <- rep(3, times=1000) # create indep. var. 2 - a constant
e <- rnorm(n = 1000, mean = 0, sd = 1) # create error
y <- 2 + 3*x1 + 4*x2 + e # create y according to population model

Run regression including constant

summary(lm( y ~ x1 + x2 ))

## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3114 -0.6618  0.0197  0.6507  3.6707 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.98124    0.03209   435.7   <2e-16 ***
## x1           2.99665    0.01620   185.0   <2e-16 ***
## x2                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.014 on 998 degrees of freedom
## Multiple R-squared:  0.9717, Adjusted R-squared:  0.9716 
## F-statistic: 3.423e+04 on 1 and 998 DF,  p-value: < 2.2e-16
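
The NA for x2 arises because the design matrix is rank-deficient: a constant regressor is an exact multiple of the intercept column, so lm() drops it. A quick rank check (a sketch using the x1 and x2 just created):

X <- model.matrix(~ x1 + x2) # design matrix: intercept, x1, x2
qr(X)$rank # 2, although there are 3 columns: one coefficient is not identified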

Very high collinearity: x1 and x2 are very highly correlated.

set.seed(1984)
x1 <- rnorm(n = 1000, mean = 0 , sd = 2) # create indep. var. 1
x2 <- 0.4*x1 + rnorm(n = 1000, mean = 0, sd = 0.01) # create indep. var. 2 - almost an exact linear function of x1
e <- rnorm(n = 1000, mean = 0, sd = 1) # create error
y <- 2 + 3*x1 + 4*x2 + e # create y according to population model
summary(lm( y ~ x1 + x2 ))

## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.84702 -0.68167 -0.01766  0.69449  2.81105 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.02596    0.03151  64.300   <2e-16 ***
## x1           2.18183    1.24299   1.755   0.0795 .  
## x2           6.02423    3.10749   1.939   0.0528 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9954 on 997 degrees of freedom
## Multiple R-squared:  0.9882, Adjusted R-squared:  0.9882 
## F-statistic: 4.169e+04 on 2 and 997 DF,  p-value: < 2.2e-16
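
Despite an R-squared of 0.99, neither slope is significant at the 5% level: near-perfect collinearity inflates the standard errors. A hand-computed variance inflation factor for x1 makes this concrete (a sketch; base R only, no extra package assumed):

r2.aux <- summary(lm(x1 ~ x2))$r.squared # R-squared of the auxiliary regression
1 / (1 - r2.aux) # VIF for x1; huge, since x2 is almost exactly 0.4*x1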

### Violation of Simple Linear Regression Assumption 4

SLR.4: E(u|x) = 0 (the error u has an expected value of zero given any value of the explanatory variable).

set.seed(1984)
x1 <- rnorm(n = 1000, mean = 0 , sd = 3) # create indep. var. 1
x2 <- rnorm(n = 1000, mean = x1 , sd = 5) # create indep. var. 2, positively correlated with x1
e <- rnorm(n = 1000, mean = 0, sd = 1)

plot(x1,x2) # plot x1 against x2

cor.test(x = x1, y = x2)

y <- 2 + 3*x1 + 4*x2 + e # create y according to population model
out.y.full <- lm(y ~ x1 + x2) # full model
out.y.x1.om <- lm(y ~ x1) # model with x2 omitted
stargazer(out.y.full, out.y.x1.om, type="text")

## 
## ===========================================================================
##                                       Dependent variable:                  
##                     -------------------------------------------------------
##                                                y                           
##                                 (1)                         (2)            
## ---------------------------------------------------------------------------
## x1                            2.990***                    6.950***         
##                               (0.012)                     (0.216)          
##                                                                            
## x2                            4.004***                                     
##                               (0.006)                                      
##                                                                            
## Constant                      2.026***                    1.650**          
##                               (0.032)                     (0.643)          
##                                                                            
## ---------------------------------------------------------------------------
## Observations                   1,000                       1,000           
## R2                             0.999                       0.508           
## Adjusted R2                    0.999                       0.508           
## Residual Std. Error       0.995 (df = 997)           20.324 (df = 998)     
## F Statistic         422,429.000*** (df = 2; 997) 1,030.926*** (df = 1; 998)
## ===========================================================================
## Note:                                           *p<0.1; **p<0.05; ***p<0.01
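
Because x1 and x2 are positively correlated and beta2 = 4 is positive, omitting x2 biases beta1-hat upward: the short regression in column (2) gives 6.95 instead of the true 3.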

If x1 and x2 are negatively correlated and beta2 is negative, then if we omit x2 from the model, beta1-hat will be positively biased.

set.seed(1984)
x1 <- rnorm(n = 1000, mean = 0 , sd = 3) # create indep. var. 1
x2 <- rnorm(n = 1000, mean = -x1 , sd = 5) # create indep. var. 2, negatively correlated with x1
plot(x1,x2) # plot x1 against x2

cor.test(x = x1, y = x2)

## 
##  Pearson's product-moment correlation
## 
## data:  x1 and x2
## t = -18.728, df = 998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5544216 -0.4625868
## sample estimates:
##        cor 
## -0.5099558

e <- rnorm(n = 1000, mean = 0, sd = 1) # create error
y <- 2 + 3*x1 - 4*x2 + e # create y according to population model
out.y.full <- lm(y ~ x1 + x2) # full model
out.y.x1.om <- lm(y ~ x1) # model with x2 omitted
stargazer(out.y.full, out.y.x1.om, type="text")

## 
## ===========================================================================
##                                       Dependent variable:                  
##                     -------------------------------------------------------
##                                                y                           
##                                 (1)                         (2)            
## ---------------------------------------------------------------------------
## x1                            2.998***                    7.039***         
##                               (0.012)                     (0.216)          
##                                                                            
## x2                           -3.996***                                     
##                               (0.006)                                      
##                                                                            
## Constant                      2.026***                    2.401***         
##                               (0.032)                     (0.642)          
##                                                                            
## ---------------------------------------------------------------------------
## Observations                   1,000                       1,000           
## R2                             0.999                       0.516           
## Adjusted R2                    0.999                       0.515           
## Residual Std. Error       0.995 (df = 997)           20.283 (df = 998)     
## F Statistic         427,153.100*** (df = 2; 997) 1,061.892*** (df = 1; 998)
## ===========================================================================
## Note:                                           *p<0.1; **p<0.05; ***p<0.01
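
As predicted, beta1-hat in column (2) is biased upward: 7.04 against the true value of 3.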

If x1 and x2 are positively correlated and beta2 is negative, then if we omit x2 from the model, beta1-hat will be negatively biased.

set.seed(1984)
x1 <- rnorm(n = 1000, mean = 0 , sd = 3) # create indep. var. 1
x2 <- rnorm(n = 1000, mean = x1/3 , sd = 5) # create indep. var. 2, weakly positively correlated with x1
cor.test(x = x1, y = x2)

## 
##  Pearson's product-moment correlation
## 
## data:  x1 and x2
## t = 5.9669, df = 998, p-value = 3.355e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1250425 0.2447737
## sample estimates:
##       cor 
## 0.1855969

e <- rnorm(n = 1000, mean = 0, sd = 1) # create error
y <- 2 + 3*x1 - 4*x2 + e # create y according to population model
out.y.full <- lm(y ~ x1 + x2) # full model
out.y.x1.om <- lm(y ~ x1) # model with x2 omitted
stargazer(out.y.full, out.y.x1.om, type="text")

## 
## ========================================================================
##                                     Dependent variable:                 
##                     ----------------------------------------------------
##                                              y                          
##                                 (1)                        (2)          
## ------------------------------------------------------------------------
## x1                            2.993***                  1.706***        
##                               (0.011)                    (0.216)        
##                                                                         
## x2                           -3.996***                                  
##                               (0.006)                                   
##                                                                         
## Constant                      2.026***                  2.401***        
##                               (0.032)                    (0.642)        
##                                                                         
## ------------------------------------------------------------------------
## Observations                   1,000                      1,000         
## R2                             0.998                      0.059         
## Adjusted R2                    0.998                      0.058         
## Residual Std. Error       0.995 (df = 997)          20.283 (df = 998)   
## F Statistic         219,639.600*** (df = 2; 997) 62.351*** (df = 1; 998)
## ========================================================================
## Note:                                        *p<0.1; **p<0.05; ***p<0.01
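
As predicted, beta1-hat in column (2) is now biased downward: 1.71 against the true value of 3. The next step quantifies this bias exactly.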

Compute the bias using the omitted variable bias formula: bias(beta1-hat) = delta1 * beta2-hat, where delta1 is the slope from the auxiliary regression of x2 on x1.

out.y.full <- lm(y ~ x1 + x2) # full model
coeffs.full <- coefficients(out.y.full)
b2_hat <- coeffs.full[3] # estimated coefficient on x2
b1_hat <- coeffs.full[2] # estimated coefficient on x1

out.part.x2 <- lm(x2 ~ x1) # auxiliary regression of x2 on x1
coeffs.part <- coefficients(out.part.x2)
delta <- coeffs.part[2] # slope: how x2 moves with x1

bias <- delta*b2_hat # omitted variable bias for beta1-hat
bias

##        x1 
## -1.287344
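
The decomposition checks out: beta1-hat from the full model plus the bias, 2.993 + (-1.287) = 1.706, equals the coefficient on x1 in the short regression above.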