###PRISM Brownbag: An Introduction to R ##By Dino Christenson & Scott Powell #Ohio State University #Accompanying information at http://polisci.osu.edu/prism/ ###Getting Started with R### ###Help Command for Linear Model Function help ("lm") ###Working Directory### setwd("K:/PRISM/Brownbags") ###Entering objects### v <- c(10,15,20) v m <- matrix(c(10,15,20,25,30,35,40,45),ncol=4) m character <- c("protestant", "catholic", "jewish") character ###Listing objects### objects () ls () ###Reading Data### south.data <- read.table("south.txt", header=TRUE) objects () names (south.data) attach (south.data) ###Browsing for Data Without the Directory### #world95.data.2<-read.spss(file.choose()) ###Loading Data from Other Spreadsheets### library(foreign) world95.data<-read.spss("World95.sav") objects () names (world95.data) attach (world95.data) senate02.data<-read.dta("Senate2002.dta") objects () names (senate02.data) attach (senate02.data) ###Matrix Manipulation in R### mat1<-matrix(c(11,21,12,22,13,23), nrow=2, ncol=3) mat1 mat2<-matrix(c(11,21,12,22,13,23), nrow=3, ncol=2) mat2 dim (mat1) ncol (mat1) nrow (mat1) mat3<-matrix (seq(1,10,1), nrow=2, ncol=5) mat3 mat4<-matrix (seq(0,5,1), nrow=2, ncol=3) mat4 mat1 mat4 mat1+mat4 mat1-mat4 mat1 mat2 mat1%*%mat2 mat1 mat3 mat1%x%mat3 mat5<-matrix (seq(0,8,1), nrow=3, ncol=3) mat5 det(mat5) solve(mat5) #will not solve bc mat5 is singular - r is smarter than us!# mat6<-matrix (c(11,21,12,22), nrow=2, ncol=2) mat6 det(mat6) solve(mat6) t(mat1) diag(1,nrow=5,ncol=5) eigen(mat6) ###Data Analysis with OLS### #Descriptive Statistics - Examples list(senate02.data) summary(senate02.data) summary(repvshr) var(repvshar) hist(income) plot(income,repvshr) #Hand-rolled OLS x<-as.matrix(cbind(int=1,income,presvote,pressup)) y<-as.vector(repvshr) i<-diag(1,nrow=nrow(x),ncol=ncol(x)) n<-length(y) p<-ncol(x)-1 xy<-t(x)%*%y # X'Y xxi<-solve(t(x)%*%x) #(X'X)^(-1) h<-x%*%xxi%*%t(x) #hat matrix of x i<-diag(1,nrow=n,ncol=n) b<-as.vector(xxi%*%xy) #estimated coefficients names(b)<-colnames(x) yhat<-as.vector(x%*%b) #predicted values for y res<-y-yhat # or (i-h)%*%y #model residuals sst<-sum((y-mean(y))^2) #Total sum of sqares sse<-t(res)%*%res # or sum(res^2) which is also t(res)%*%res ssm<-sst-sse #sum of squares for model (regression) df.e<-(n-p-1) #degrees of freedom for error df.t<-(n-1) #total degrees of freedom df.m<-df.t-df.e #degrees of freedom for model s2<-as.vector(sse/df.e) # or (t(res)%*%res)?(n-p-1) sigma2<-as.vector(sse/(n-p)) r2<-1-(sse/sst) r2.adj<-1-((sse/df.e)/(sst/df.t)) aic<-n*log(sse/n)+2*(p+1) cp<-(sse/s2)-(n-2*(p+1)) f<-(ssm/df.m)/(sse/df.e) pvalue<-1-pf(f,df.m,df.e) b.standard.errors<-sqrt(diag(xxi))*sqrt(s2) #coefficient standard errors b.t.statistic<-b/b.standard.errors #t statistic for st. errors b.t.prob<-2*(1-pt(b.t.statistic,df.e)) #alpha 0.05 #Call the Estimates b b.standard.errors #OLS using the canned R procedure (i.e. the 'lm' command) canned.ols <- lm(repvshr ~ income + presvote + pressup) summary(canned.ols) ###Diagnostics & Cool Graphs with OLS### ##Note: this material relies on John Fox's book on applied statistical models in R and his R library "car": http://cran.r-project.org/doc/packages/car.pdf library(car) #typical residual tests library(lmtest) #BP GQ HM tests library(perturb) #condition index test x<-cbind(1,income,presvote,pressup) ols.model1<-lm(formula = repvshr~income+presvote+pressup) summary(ols.model1) ##Leverage hatvalues(ols.model1) avg.mod1<-ncol(x)/nrow(x) avg.mod1 plot(hatvalues(ols.model1)) abline(h=1*(ncol(x))/nrow(x)) abline(h=2*(ncol(x))/nrow(x)) abline(h=3*(ncol(x))/nrow(x)) identify(hatvalues(ols.model1)) ##Outliers rstudent(ols.model1) plot(rstudent(ols.model1)) abline(h=c(-2,2)) identify(rstudent(ols.model1)) ##Influence: Cook's D plot(cookd(ols.model1)) abline(h=4/(nrow(x)-ncol(x))) identify(cookd(ols.model1)) ##Other measures of influence, including df-beta and df-fit influence.measures(ols.model1) ##Normal distribution test qq.plot(ols.model1,distribution="norm") plot(density(rstudent(ols.model1))) ##Nonconstant error variance data.frame(fitted.values=predict(ols.model1), residual=resid(ols.model1)) par(mfrow=c(2,2)) plot(resid(ols.model1)~fitted.values(ols.model1)) plot(resid(ols.model1)~income) plot(resid(ols.model1)~presvote) plot(resid(ols.model1)~pressup) ##4 Helpful graphs in one simple command #par(mfrow=c(2,2)) ##splits the graphing screen #plot(ols.model1, which=1) #plot(ols.model1, which=2) #plot(ols.model1, which=3) #plot(ols.model1, which=4) #or par(mfrow=c(2,2)) ##splits the graphing screen plot(ols.model1, which=1:4) ##Breusch-Pagan, Goldfeld-Quandt, and Harrison-McCabe tests bptest(ols.model1) gqtest(ols.model1) hmctest(ols.model1) ##Variance inflation factors vif(ols.model1) ##Obtain the condition index colldiag(ols.model1) ###Loops and Functions### #Loops #Simple iteration for (i in 1:10) print (i) #Loop to calculate the mean of income #Create objects sum <- 0 avg <- 0 for (i in 1:22){ sum <- sum + income[i] avg <- sum/i } #Output avg #Test against the mean command mean(income) #Functions #Designing an OLS function (based on hand-rolled ols commands above) ols<-function(y,x){ x<-as.matrix(cbind(int=1,x)) y<-as.vector(y) i<-diag(1,nrow=nrow(x),ncol=ncol(x)) n<-length(y) p<-ncol(x)-1 xy<-t(x)%*%y # X'Y xxi<-solve(t(x)%*%x) #(X'X)^(-1) h<-x%*%xxi%*%t(x) #hat matrix of x i<-diag(1,nrow=n,ncol=n) b<-as.vector(xxi%*%xy) #estimated coefficients names(b)<-colnames(x) yhat<-as.vector(x%*%b) #predicted values for y res<-y-yhat # or (i-h)%*%y #model residuals sst<-sum((y-mean(y))^2) #Total sum of sqares sse<-t(res)%*%res # or sum(res^2) which is also t(res)%*%res ssm<-sst-sse #sum of squares for model (regression) df.e<-(n-p-1) #degrees of freedom for error df.t<-(n-1) #total degrees of freedom df.m<-df.t-df.e #degrees of freedom for model s2<-as.vector(sse/df.e) # or (t(res)%*%res)?(n-p-1) sigma2<-as.vector(sse/(n-p)) r2<-1-(sse/sst) r2.adj<-1-((sse/df.e)/(sst/df.t)) aic<-n*log(sse/n)+2*(p+1) cp<-(sse/s2)-(n-2*(p+1)) f<-(ssm/df.m)/(sse/df.e) pvalue<-1-pf(f,df.m,df.e) b.standard.errors<-sqrt(diag(xxi))*sqrt(s2) #coefficient standard errors b.t.statistic<-b/b.standard.errors #t statistic for st. errors b.t.prob<-2*(1-pt(b.t.statistic,df.e)) #alpha 0.05 b.table<-cbind(est=b,b.se=b.standard.errors,t.stat=b.t.statistic,p=b.t.prob) return(list(b=b.table )) } #Output of new ols command ols(repvshr, income) #Compare to lm command lm(repvshr ~ income) ###And that's all she wrote...### ##This script and the accompanying data can be found on the web at http://polisci.osu.edu/prism/luncheons.htm