Can anybody lead me with a small simulation? Should I calculate prediction error rate from a classification table while splitting the data into two parts then fitting on the training data and predicting on the test data and see the misclassification (i.e. how many 1 predicted as 0 and 0 as 1, calculating the rate of misclassification).
I tried the following R code:
# prediction error rate for logit and probit for different sample sizes and sigma:
mu <- rep(0,4)
library(MASS)
pred.err <- function(sam.size,sigma,cut.point.latent){
d <- mvrnorm(sam.size,mu,sigma)
obs <- ifelse(d[ ,1]>cut.point.latent,1,0) # cut point for y*(latent) to be y(observed)
x1 <- d[ ,2]
x2 <- d[ ,3]
x3 <- d[ ,4]
dat <- data.frame(obs,x1,x2,x3)
sam.select <- sample(1:nrow(dat),size=nrow(dat)*0.6) # 60% training data
train.dat <- dat[sam.select,]
test.dat <- dat[-sam.select,]
fit.logit.train <- with(train.dat,glm(obs ~ x1+x2+x3,family=binomial(link="logit")))
fit.probit.train <- with(train.dat,glm(obs ~ x1+x2+x3,family=binomial(link="probit")))
test.covar <- test.dat[ ,2:4]
pred.prob.logit <-predict(fit.logit.train,newdata=test.covar,type="response")
pred.prob.probit <-predict(fit.probit.train,newdata=test.covar,type="response")
obs.res <- test.dat[ , 1]
cut.point.prob <- pnorm(cut.point.latent) # as cut point should be same for logit and probit
pred.res.logit <- ifelse(pred.prob.logit >= cut.point.prob,1,0)
pred.res.probit <- ifelse(pred.prob.probit >= cut.point.prob,1,0)
logit.obs.pred <- data.frame(obs.res,pred.res.logit) # as in pred.res the columns
#are different from obs.res
probit.obs.pred <- data.frame(obs.res, pred.res.probit)
tab.logit <- table(logit.obs.pred)
tab.probit <- table(probit.obs.pred)
logit.pred.err.rate <- sum(tab.logit[1,2]+tab[2,1])/sum(tab.logit)
probit.pred.err.rate <- sum(tab.probit[1,2]+tab[2,1])/sum(tab.probit)
list(logit.pred.err.rate,probit.pred.err.rate)
}
pred.err.logit <-NULL
pred.err.probit <-NULL
result <- function(rep.no,sam.size,sigma,cut.point.latent){
for(i in 1:rep.no){
sam <-pred.err(sam.size,sigma,cut.point.latent)
pred.err.logit[i] <- sam[[1]]
pred.err.probit[i] <- sam[[2]]
}
list(pred.err.logit,pred.err.probit)
}
sigma.high <-matrix(c(1,.41,.5,-.7,.41,1,0,0,.5,0,1,0,-.7,0,0,1),nrow=4)
output <- result(rep.no=1000,sam.size=500,sigma=sigma.high,cut.point.latent=.53)
### findings :
t.test(output[[1]],output[[2]])