# Installing package
# FIX: use an HTTPS repository URL; plain-HTTP downloads are insecure and
# CRAN mirrors increasingly refuse them.
install.packages("wooldridge", dependencies = TRUE, repos = "https://cran.rstudio.com/")
## Installing package into 'C:/Users/juanc/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'wooldridge' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\juanc\AppData\Local\Temp\RtmpwzWc5x\downloaded_packages
# FIX: HTTPS repository URL (plain HTTP is insecure and often rejected by CRAN mirrors)
install.packages("AER", dependencies = TRUE, repos = "https://cran.rstudio.com/")
## Installing package into 'C:/Users/juanc/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'AER' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\juanc\AppData\Local\Temp\RtmpwzWc5x\downloaded_packages
# Call package
# AER provides ivreg() for instrumental-variables (2SLS) estimation; loading
# it also attaches its dependencies (car, lmtest, sandwich, survival, zoo),
# as the startup messages below show.
library(AER)
## Warning: package 'AER' was built under R version 4.3.1
## Loading required package: car
## Warning: package 'car' was built under R version 4.3.1
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.1
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.3.1
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.1
## Loading required package: survival
# wooldridge bundles the textbook datasets, including the Card proximity-to-
# college data ("card") used below.
library(wooldridge)
## Warning: package 'wooldridge' was built under R version 4.3.1
# Load data
# card: wage, education, experience and proximity-to-college variables
# (the columns wage, educ, exper, black, smsa, south, nearc4, lwage are
# referenced later in this script)
data("card")

# IV regression
# BUG FIX: in an R model formula, exper*exper expands to
# exper + exper + exper:exper, and exper:exper collapses to exper, so the
# original call silently DROPPED the quadratic experience term — note that
# the printed output below has no exper^2 coefficient. I(exper^2) includes
# the square literally on both sides of the `|`.
iv <- ivreg(log(wage) ~ educ + exper + I(exper^2)
            + black + smsa + south | nearc4 + exper
            + I(exper^2) + black + smsa + south,
            data = card)
# The instrument here is living near a university (nearc4)
# and educ is the instrumented variable that is suspected to be endogenous

# Results
# NOTE(review): the output shown below was generated with the formula term
# exper*exper, which R collapses to exper — that is why no squared-experience
# coefficient appears among the estimates.
summary(iv)
## 
## Call:
## ivreg(formula = log(wage) ~ educ + exper + exper * exper + black + 
##     smsa + south | nearc4 + exper + exper * exper + black + smsa + 
##     south, data = card)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8135 -0.2418  0.0249  0.2553  1.3920 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.93982    0.83176   4.737 2.27e-06 ***
## educ         0.13185    0.04957   2.660  0.00786 ** 
## exper        0.06227    0.01969   3.163  0.00158 ** 
## black       -0.12960    0.05326  -2.433  0.01502 *  
## smsa         0.13483    0.03029   4.451 8.86e-06 ***
## south       -0.10925    0.02318  -4.714 2.54e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3938 on 3004 degrees of freedom
## Multiple R-Squared: 0.214,   Adjusted R-squared: 0.2127 
## Wald test: 134.4 on 5 and 3004 DF,  p-value: < 2.2e-16

Two-stage least squares (ordinary least squares in two stages)

Another way to obtain the coefficients from instrumental variables is to run two regressions; this method is called two-stage least squares and it is useful for building intuition behind the instrumental variable method. In the first stage we extract the exogenous variation from the endogenous variable (given that our instrument is a valid one). In the second stage we use that exogenous variation to estimate the causal relationship.

# Installing package
# FIX: HTTPS repository URL. A repeated install.packages() call for a loaded
# package only emits the warning shown below.
install.packages("wooldridge", dependencies = TRUE, repos = "https://cran.rstudio.com/")
## Warning: package 'wooldridge' is in use and will not be installed
# Call package
library(wooldridge)

### First stage
# Regress the endogenous variable (educ) on the instrument plus every
# exogenous covariate of the structural equation.
# BUG FIX: exper*exper collapses to exper inside a formula (see the output
# below — no squared term is estimated); I(exper^2) keeps the quadratic.
# Also: use `<-` for assignment per R convention.
ols1 <- lm(educ ~ nearc4 + exper
           + I(exper^2) + black + smsa + south, data = card)
# In the left side: endogenous variable
# In the right side: all exogenous variables

# Save fitted values i.e. the exogenous variation in educ
y1hat <- fitted.values(ols1)

# First stage results
summary(ols1)
## 
## Call:
## lm(formula = educ ~ nearc4 + exper + exper * exper + black + 
##     smsa + south, data = card)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6171 -1.3861 -0.1034  1.2774  6.2553 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.601545   0.119369 139.077  < 2e-16 ***
## nearc4       0.337368   0.082489   4.090 4.43e-05 ***
## exper       -0.395563   0.008719 -45.370  < 2e-16 ***
## black       -1.006665   0.089626 -11.232  < 2e-16 ***
## smsa         0.402811   0.084842   4.748 2.15e-06 ***
## south       -0.290110   0.079155  -3.665 0.000252 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.942 on 3004 degrees of freedom
## Multiple R-squared:  0.4744, Adjusted R-squared:  0.4736 
## F-statistic: 542.3 on 5 and 3004 DF,  p-value: < 2.2e-16
### Second stage
# Replace educ with its first-stage fitted values (y1hat) in the wage
# equation. BUG FIX: I(exper^2) instead of exper*exper, which R collapses
# to just exper; `<-` for assignment.
ols2 <- lm(log(wage) ~ y1hat + exper
           + I(exper^2) + black + smsa + south, data = card)

# Second stage results
summary(ols2)
## 
## Call:
## lm(formula = log(wage) ~ y1hat + exper + exper * exper + black + 
##     smsa + south, data = card)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.66288 -0.24916  0.01312  0.27211  1.37734 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.93982    0.85169   4.626 3.89e-06 ***
## y1hat        0.13185    0.05076   2.597  0.00944 ** 
## exper        0.06227    0.02016   3.089  0.00203 ** 
## black       -0.12960    0.05454  -2.376  0.01755 *  
## smsa         0.13483    0.03102   4.347 1.43e-05 ***
## south       -0.10925    0.02373  -4.604 4.32e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4032 on 3004 degrees of freedom
## Multiple R-squared:  0.1759, Adjusted R-squared:  0.1745 
## F-statistic: 128.2 on 5 and 3004 DF,  p-value: < 2.2e-16

It is important to remark that doing instrumental variables in two stages with the previous code generates wrong standard errors. The logic behind this is that when we calculate standard errors we need to account for all the variability of \(x\), not only the exogenous variability provided by the instrument. The next code shows how to correct the standard errors; however, the best way to proceed is simply to use a package like AER and its ivreg command.

# Installing packages
# FIX: HTTPS repository URLs. Repeated installs of loaded packages only
# produce the warnings shown below.
install.packages("wooldridge", dependencies = TRUE, repos = "https://cran.rstudio.com/")
## Warning: package 'wooldridge' is in use and will not be installed
install.packages("AER", dependencies = TRUE, repos = "https://cran.rstudio.com/")
## Warning: package 'AER' is in use and will not be installed
# Call packages
library(wooldridge)
library(AER)

# Instrumental variables using ivreg
# Simplest possible model: one endogenous regressor (educ), one instrument
# (nearc4). FIX: use `<-` consistently for assignment (the original mixed
# `=` and `<-`).
iv <- ivreg(log(wage) ~ educ | nearc4,
            data = card)

# Instrumental variables using two regressions
### First stage: endogenous educ on the instrument
ols1 <- lm(educ ~ nearc4, data = card)
y1hat <- fitted.values(ols1)

### Second stage: outcome on the first-stage fitted values
ols2 <- lm(log(wage) ~ y1hat, data = card)

# Correct residuals: ivreg computes residuals as y minus X*beta using the
# ACTUAL educ values, which is the residual the 2SLS standard errors need
sum((resid(iv))^2)
## [1] 932.7532
# Wrong residuals: the manual second stage computes residuals against the
# FITTED values y1hat instead of educ, so its residual sum of squares is
# too small and its standard errors are invalid
sum((resid(ols2))^2)
## [1] 576.7756
# Calculating correct residuals by hand: apply the second-stage coefficients
# to the ORIGINAL educ variable rather than the fitted values.
# FIX: the original used attach(card) without a matching detach(), which
# pollutes the search path for the rest of the session; reference the data
# frame columns explicitly instead.
b <- coef(ols2)  # intercept and y1hat (education) coefficients
correct_residuals <- sum((card$lwage - (b[1] + b[2] * card$educ))^2)
correct_residuals
## [1] 932.7532
# Using correct residuals to calculate standard errors
num <- correct_residuals / (nrow(card) - length(b))  # s^2 = SSR / (n - k)
# Denominator: variation in the first-stage fitted values; the R^2 of an
# intercept-only regression of y1hat is 0, so the (1 - R^2) factor is 1 here
den <- sum((y1hat - mean(y1hat))^2) *
  (1 - summary(lm(y1hat ~ 1, data = card))$r.squared)

correctse <- sqrt(num) / sqrt(den)
# Comparing standard errors for education
correctse
## [1] 0.02629134
# Standard errors reported by ivreg, for comparison: the educ entry matches
# the hand-computed correctse above
summary(iv)$coefficients[,2]
## (Intercept)        educ 
##  0.34886175  0.02629134

The above code makes use of the standard error formula:

\[ SE_{\hat{\beta}_1}=\sqrt{\dfrac{\sum_{i=1}^n (y_i-\hat{\beta}_0-\hat{\beta}_1 x_{1i})^2/(n-k)}{\sum_{i=1}^n (\hat{x}_i-\bar{\hat{x}})^2\, (1-R_j^2)}} \]

Where \(R_j^2\) represents the \(R^2\) from a regression of variable \(X_j\) on all the other regressors.