Econometrics Lab 5
##Setup
setwd("~/Desktop/R WD")
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(data.table)
Exercise 1
load("datasets/car.test.RData")
load("datasets/car.train.RData")
Use the data to build a model explaining the price of used cars
colnames(car.train)
## [1] "Price" "Age" "KM" "FuelType" "HP" "MetColor"
## [7] "Automatic" "CC" "Doors" "Weight"
car.train <- data.table(car.train)
Manipulate the car.train object to add squared variables
car.train <- car.train[, Age2:=Age^2] # create new var. age^2
car.train <- car.train[, HP2:=HP^2] # create new var. hp^2
car.train <- car.train[, KM2:=KM^2] # create new var. km^2
car.train <- car.train[, Weight2:=Weight^2] # create new var. weight^2
Regress price against all other variables
out1 <- lm(Price ~ . , data = car.train)
summary(step(out1)) # this selects which variables to include in the model using AIC
## Start: AIC=12191.59
## Price ~ Age + KM + FuelType + HP + MetColor + Automatic + CC +
## Doors + Weight + Age2 + HP2 + KM2 + Weight2
##
## Df Sum of Sq RSS AIC
## - HP 1 101512 1155629993 12190
## - MetColor 1 245096 1155773577 12190
## - Doors 1 362635 1155891116 12190
## <none> 1155528481 12192
## - KM 1 4804869 1160333350 12193
## - KM2 1 5614378 1161142858 12194
## - Automatic 1 6156850 1161685330 12194
## - FuelType 2 11347088 1166875569 12196
## - HP2 1 10217061 1165745542 12197
## - Weight2 1 14628686 1170157166 12200
## - CC 1 16922292 1172450773 12202
## - Weight 1 24374273 1179902754 12208
## - Age2 1 150757761 1306286241 12295
## - Age 1 502301715 1657830196 12501
##
## Step: AIC=12189.66
## Price ~ Age + KM + FuelType + MetColor + Automatic + CC + Doors +
## Weight + Age2 + HP2 + KM2 + Weight2
##
## Df Sum of Sq RSS AIC
## - MetColor 1 234907 1155864900 12188
## - Doors 1 352299 1155982291 12188
## <none> 1155629993 12190
## - KM 1 4987681 1160617674 12191
## - KM2 1 5521759 1161151752 12192
## - Automatic 1 6391686 1162021679 12192
## - Weight2 1 15009576 1170639568 12199
## - Weight 1 24683893 1180313886 12206
## - FuelType 2 29650278 1185280270 12208
## - CC 1 51344325 1206974318 12225
## - HP2 1 132673567 1288303559 12281
## - Age2 1 150830232 1306460225 12293
## - Age 1 504733611 1660363604 12500
##
## Step: AIC=12187.84
## Price ~ Age + KM + FuelType + Automatic + CC + Doors + Weight +
## Age2 + HP2 + KM2 + Weight2
##
## Df Sum of Sq RSS AIC
## - Doors 1 321577 1156186477 12186
## <none> 1155864900 12188
## - KM 1 4925321 1160790221 12190
## - KM2 1 5645757 1161510657 12190
## - Automatic 1 6301657 1162166557 12190
## - Weight2 1 14874551 1170739451 12197
## - Weight 1 24522735 1180387635 12204
## - FuelType 2 29498524 1185363424 12206
## - CC 1 51112748 1206977648 12223
## - HP2 1 132570109 1288435009 12279
## - Age2 1 151806190 1307671090 12292
## - Age 1 507773611 1663638511 12500
##
## Step: AIC=12186.08
## Price ~ Age + KM + FuelType + Automatic + CC + Weight + Age2 +
## HP2 + KM2 + Weight2
##
## Df Sum of Sq RSS AIC
## <none> 1156186477 12186
## - KM 1 5037578 1161224055 12188
## - KM2 1 5596264 1161782741 12188
## - Automatic 1 7365407 1163551884 12190
## - Weight2 1 15564105 1171750582 12196
## - Weight 1 26981390 1183167867 12204
## - FuelType 2 32649423 1188835900 12206
## - CC 1 51164720 1207351197 12221
## - HP2 1 145162799 1301349276 12286
## - Age2 1 156182032 1312368509 12293
## - Age 1 522164399 1678350876 12505
##
## Call:
## lm(formula = Price ~ Age + KM + FuelType + Automatic + CC + Weight +
## Age2 + HP2 + KM2 + Weight2, data = car.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6620.3 -690.0 -7.1 664.0 5663.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.767e+04 7.576e+03 -2.333 0.019889 *
## Age -2.478e+02 1.265e+01 -19.593 < 2e-16 ***
## KM -7.586e-03 3.942e-03 -1.924 0.054632 .
## FuelTypeDiesel 2.477e+03 5.362e+02 4.620 4.44e-06 ***
## FuelTypePetrol 9.218e+02 4.250e+02 2.169 0.030343 *
## Automatic 4.143e+02 1.780e+02 2.327 0.020200 *
## CC -3.380e+00 5.512e-01 -6.133 1.32e-09 ***
## Weight 5.637e+01 1.266e+01 4.454 9.56e-06 ***
## Age2 1.241e+00 1.158e-01 10.715 < 2e-16 ***
## HP2 2.327e-01 2.252e-02 10.331 < 2e-16 ***
## KM2 -4.005e-08 1.974e-08 -2.028 0.042835 *
## Weight2 -1.735e-02 5.129e-03 -3.383 0.000751 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1166 on 850 degrees of freedom
## Multiple R-squared: 0.8912, Adjusted R-squared: 0.8898
## F-statistic: 633.2 on 11 and 850 DF, p-value: < 2.2e-16
Take preferred model from previous step and run usual commands
out <- lm( Price ~ Age + Age2 + KM + KM2 + HP + HP2 + Weight + Weight2 + FuelType + Automatic + CC
, data = car.train)
stargazer(out, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## Price
## -----------------------------------------------
## Age -248.050***
## (12.706)
##
## Age2 1.243***
## (0.116)
##
## KM -0.007*
## (0.004)
##
## KM2 -0.00000**
## (0.00000)
##
## HP -6.844
## (27.792)
##
## HP2 0.255***
## (0.093)
##
## Weight 56.100***
## (12.709)
##
## Weight2 -0.017***
## (0.005)
##
## FuelTypeDiesel 2,317.297***
## (842.180)
##
## FuelTypePetrol 918.760**
## (425.368)
##
## Automatic 409.799**
## (179.082)
##
## CC -3.203***
## (0.907)
##
## Constant -17,348.070**
## (7,693.953)
##
## -----------------------------------------------
## Observations 862
## R2 0.891
## Adjusted R2 0.890
## Residual Std. Error 1,166.929 (df = 849)
## F Statistic 579.810*** (df = 12; 849)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
out2 <- lm( Price ~ MetColor + Doors + Age + Age2 + KM + KM2 + HP + HP2 + Weight + Weight2 + FuelType + Automatic + CC
, data = car.train)
stargazer(out, out2, type = "text") #Can see these two variables add nothing to the model and reduce F-stat
##
## =======================================================================
## Dependent variable:
## ---------------------------------------------------
## Price
## (1) (2)
## -----------------------------------------------------------------------
## MetColor 36.492
## (86.095)
##
## Doors -26.311
## (51.033)
##
## Age -248.050*** -246.858***
## (12.706) (12.865)
##
## Age2 1.243*** 1.233***
## (0.116) (0.117)
##
## KM -0.007* -0.007*
## (0.004) (0.004)
##
## KM2 -0.00000** -0.00000**
## (0.00000) (0.00000)
##
## HP -6.844 -7.595
## (27.792) (27.843)
##
## HP2 0.255*** 0.254***
## (0.093) (0.093)
##
## Weight 56.100*** 59.215***
## (12.709) (14.009)
##
## Weight2 -0.017*** -0.018***
## (0.005) (0.006)
##
## FuelTypeDiesel 2,317.297*** 2,266.370***
## (842.180) (848.706)
##
## FuelTypePetrol 918.760** 940.408**
## (425.368) (428.126)
##
## Automatic 409.799** 391.723**
## (179.082) (184.395)
##
## CC -3.203*** -3.200***
## (0.907) (0.908)
##
## Constant -17,348.070** -19,255.510**
## (7,693.953) (8,505.219)
##
## -----------------------------------------------------------------------
## Observations 862 862
## R2 0.891 0.891
## Adjusted R2 0.890 0.890
## Residual Std. Error 1,166.929 (df = 849) 1,168.016 (df = 847)
## F Statistic 579.810*** (df = 12; 849) 496.086*** (df = 14; 847)
## =======================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Apply data table to test data set and create squared variables
car.test <- data.table(car.test)
car.test <- car.test[, Age2:=Age^2]
car.test <- car.test[, HP2:=HP^2]
car.test <- car.test[, Weight2:=Weight^2]
car.test <- car.test[, KM2:=KM^2]
Define car.test$y_hat which uses model from first data set to predict values for second data set
car.test$y_hat <- predict(out1, newdata=car.test)
colnames(car.test)
## [1] "Price" "Age" "KM" "FuelType" "HP" "MetColor"
## [7] "Automatic" "CC" "Doors" "Weight" "Age2" "HP2"
## [13] "Weight2" "KM2" "y_hat"
head(car.test)
## Price Age KM FuelType HP MetColor Automatic CC Doors Weight Age2
## 1: 11290 49 80320 Petrol 110 1 1 1600 3 1070 2401
## 2: 15950 19 51884 Petrol 97 1 0 1400 3 1100 361
## 3: 8500 80 100458 Petrol 110 0 0 1600 5 1085 6400
## 4: 8900 67 54847 Petrol 110 0 0 1600 3 1050 4489
## 5: 15950 28 29206 Petrol 97 1 0 1400 5 1110 784
## 6: 15950 30 67660 Petrol 110 1 0 1600 3 1105 900
## HP2 Weight2 KM2 y_hat
## 1: 12100 1144900 6451302400 11514.079
## 2: 9409 1210000 2691949456 17013.095
## 3: 12100 1177225 10091809764 8309.281
## 4: 12100 1102500 3008193409 9141.654
## 5: 9409 1232100 852990436 15691.126
## 6: 12100 1221025 4577875600 14807.865
Our model does a reasonable job of predicting price
Use the RMSE to compare the performance of your model in carTrain.RData and carTest.RData
library(Metrics)
car.train$y_hat <- predict(out1, newdata=car.train)
head(car.train)
## Price Age KM FuelType HP MetColor Automatic CC Doors Weight Age2 HP2
## 1: 13750 23 72937 Diesel 90 1 0 2000 3 1165 529 8100
## 2: 13950 24 41711 Diesel 90 1 0 2000 3 1165 576 8100
## 3: 13750 30 38500 Diesel 90 0 0 2000 3 1170 900 8100
## 4: 12950 32 61000 Diesel 90 0 0 2000 3 1170 1024 8100
## 5: 16900 27 94612 Diesel 90 1 0 2000 3 1245 729 8100
## 6: 18600 30 75889 Diesel 90 1 0 2000 3 1245 900 8100
## KM2 Weight2 y_hat
## 1: 5319805969 1357225 16242.52
## 2: 1739807521 1357225 16432.39
## 3: 1482250000 1368900 15430.64
## 4: 3721000000 1368900 14830.84
## 5: 8951430544 1550025 16391.52
## 6: 5759140321 1550025 16131.60
Define residual and residual^2
car.train$u_hat <- car.train$y_hat - car.train$Price
car.train$u_hat2 <- (car.train$u_hat)^2
head(car.train)
## Price Age KM FuelType HP MetColor Automatic CC Doors Weight Age2 HP2
## 1: 13750 23 72937 Diesel 90 1 0 2000 3 1165 529 8100
## 2: 13950 24 41711 Diesel 90 1 0 2000 3 1165 576 8100
## 3: 13750 30 38500 Diesel 90 0 0 2000 3 1170 900 8100
## 4: 12950 32 61000 Diesel 90 0 0 2000 3 1170 1024 8100
## 5: 16900 27 94612 Diesel 90 1 0 2000 3 1245 729 8100
## 6: 18600 30 75889 Diesel 90 1 0 2000 3 1245 900 8100
## KM2 Weight2 y_hat u_hat u_hat2
## 1: 5319805969 1357225 16242.52 2492.5194 6212653.1
## 2: 1739807521 1357225 16432.39 2482.3940 6162280.1
## 3: 1482250000 1368900 15430.64 1680.6424 2824558.9
## 4: 3721000000 1368900 14830.84 1880.8411 3537563.1
## 5: 8951430544 1550025 16391.52 -508.4822 258554.2
## 6: 5759140321 1550025 16131.60 -2468.3975 6092986.2
Root mean squared error
rmse.train <- sqrt(sum(car.train$u_hat2)/nrow(car.train))
rmse.train
## [1] 1157.808
Alternatively use function rmse:
rmse.train.2 <- rmse(car.train$Price, car.train$y_hat)
rmse.train.2
## [1] 1157.808
On the other data set, define residuals
car.test$u_hat <- car.test$y_hat - car.test$Price
car.test$u_hat2 <- (car.test$u_hat)^2
head(car.test)
## Price Age KM FuelType HP MetColor Automatic CC Doors Weight Age2
## 1: 11290 49 80320 Petrol 110 1 1 1600 3 1070 2401
## 2: 15950 19 51884 Petrol 97 1 0 1400 3 1100 361
## 3: 8500 80 100458 Petrol 110 0 0 1600 5 1085 6400
## 4: 8900 67 54847 Petrol 110 0 0 1600 3 1050 4489
## 5: 15950 28 29206 Petrol 97 1 0 1400 5 1110 784
## 6: 15950 30 67660 Petrol 110 1 0 1600 3 1105 900
## HP2 Weight2 KM2 y_hat u_hat u_hat2
## 1: 12100 1144900 6451302400 11514.079 224.0795 50211.60
## 2: 9409 1210000 2691949456 17013.095 1063.0951 1130171.14
## 3: 12100 1177225 10091809764 8309.281 -190.7189 36373.72
## 4: 12100 1102500 3008193409 9141.654 241.6545 58396.88
## 5: 9409 1232100 852990436 15691.126 -258.8741 67015.81
## 6: 12100 1221025 4577875600 14807.865 -1142.1354 1304473.28
rmse.test <- rmse(car.test$Price, car.test$y_hat)
rmse.test
## [1] 1312.619
Model 1 - compound depreciation
summary(out6a <- lm( log(Price) ~ Age, data=car.train))
##
## Call:
## lm(formula = log(Price) ~ Age, data = car.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.83085 -0.07564 0.00464 0.09004 0.45757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.0066369 0.0149195 670.71 <2e-16 ***
## Age -0.0138416 0.0002534 -54.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1378 on 860 degrees of freedom
## Multiple R-squared: 0.7762, Adjusted R-squared: 0.7759
## F-statistic: 2983 on 1 and 860 DF, p-value: < 2.2e-16
coeffs.out6a <- coefficients(out6a)
logValue <- coeffs.out6a[1]
Value <- exp(logValue)
delta <- 1 - exp(coeffs.out6a[2])
Value
## (Intercept)
## 22173.14
delta
## Age
## 0.01374627
Model 2 - linear depreciation
summary(out6b <- lm( Price ~ Age, data=car.train))
##
## Call:
## lm(formula = Price ~ Age, data = car.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6234.2 -942.0 68.6 832.5 11888.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20056.425 179.011 112.04 <2e-16 ***
## Age -167.361 3.041 -55.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1653 on 860 degrees of freedom
## Multiple R-squared: 0.7789, Adjusted R-squared: 0.7786
## F-statistic: 3029 on 1 and 860 DF, p-value: < 2.2e-16
coeffs.out6b <- coefficients(out6b)
Value2 <- coeffs.out6b[1]
alpha <- coeffs.out6b[2]/Value2
Value2
## (Intercept)
## 20056.42
alpha
## Age
## -0.008344515
Which model is better? Compare R^2 But first model is log y so must convert to y before getting R^2
summary(out6a)$sigma
## [1] 0.1377964
gamma <- exp((summary(out6a)$sigma)^2/2)
car.train$logyhat <- predict(out6a, newdata= car.train)
car.train$yhat <- gamma*exp(car.train$logyhat)
cor(car.train$yhat, car.train$Price)^2
## [1] 0.8116621
summary(out6b)$r.squared
## [1] 0.7788599