I have the following dataframe on which I did logistic regression with response as outcome. There are some good predictors in these variables so I expected significant variables.
structure(list(response = c(0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L,
0L, 0L, 1L, 0L, 1L, 0L), HIST1H3F_rna = c(1.09861228866811, 0.693147180559945,
2.07944154167984, 1.09861228866811, 1.79175946922805, 0, 0, 0,
2.39789527279837, 1.38629436111989, 1.6094379124341, 1.6094379124341,
0.693147180559945, 1.79175946922805, 0), NCF1_rna = c(2.77258872223978,
3.09104245335832, 2.63905732961526, 2.19722457733622, 2.30258509299405,
2.56494935746154, 3.09104245335832, 3.98898404656427, 2.56494935746154,
4.06044301054642, 3.87120101090789, 2.07944154167984, 3.49650756146648,
3.17805383034795, 3.95124371858143), WDR66_rna = c(5.06890420222023,
4.49980967033027, 5.11799381241676, 3.40119738166216, 3.25809653802148,
4.02535169073515, 5.8348107370626, 5.89440283426485, 3.87120101090789,
5.67675380226828, 5.35185813347607, 4.15888308335967, 6.23441072571837,
5.91889385427315, 3.68887945411394), PTH2R_rna = c(0.693147180559945,
5.08759633523238, 0.693147180559945, 1.09861228866811, 0, 6.01126717440416,
6.56526497003536, 5.18178355029209, 0, 4.36944785246702, 2.19722457733622,
1.09861228866811, 3.49650756146648, 1.38629436111989, 5.93753620508243
), HAVCR2_rna = c(4.48863636973214, 3.40119738166216, 3.09104245335832,
2.94443897916644, 3.2188758248682, 3.76120011569356, 3.95124371858143,
2.83321334405622, 2.07944154167984, 4.36944785246702, 3.58351893845611,
1.94591014905531, 4.23410650459726, 3.43398720448515, 2.56494935746154
), CD200R1_rna = c(2.484906649788, 2.94443897916644, 0.693147180559945,
1.94591014905531, 0.693147180559945, 2.89037175789616, 2.56494935746154,
1.6094379124341, 1.6094379124341, 1.94591014905531, 2.19722457733622,
0.693147180559945, 4.26267987704132, 1.6094379124341, 0.693147180559945
)), .Names = c("response", "HIST1H3F_rna", "NCF1_rna", "WDR66_rna",
"PTH2R_rna", "HAVCR2_rna", "CD200R1_rna"), row.names = c(NA,
-15L), class = "data.frame")
However, running the following lines and getting a summary of the model I find that all variables have a p-value of 1 and the standard errors seem so high. What's going on here?
fullmod <- glm(response ~ ., data=final_model,family='binomial')
summary(fullmod)
Call:
glm(formula = response ~ ., family = "binomial", data = final_model)
Deviance Residuals:
Min 1Q Median 3Q Max
-6.515e-06 -2.404e-06 -2.110e-08 2.110e-08 7.470e-06
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.460e+02 5.598e+05 0 1
HIST1H3F_rna 2.135e+01 5.145e+05 0 1
NCF1_rna -4.133e+01 3.388e+05 0 1
WDR66_rna 1.296e+01 6.739e+05 0 1
PTH2R_rna 1.975e+00 3.775e+05 0 1
HAVCR2_rna -2.477e+01 1.191e+06 0 1
CD200R1_rna -1.420e+01 1.315e+06 0 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2.0190e+01 on 14 degrees of freedom
Residual deviance: 2.2042e-10 on 8 degrees of freedom
AIC: 14
Number of Fisher Scoring iterations: 25
In response to your comments I'll show the feature selection step (and the complete dataframe I'm working with below that).
# forward feature selection
library('boot')
z = c()
nullmod <- glm(response ~ 1, data=final_model, family='binomial') ## ‘empty’
fullmod <- glm(response ~ ., data=final_model, family='binomial') ## Full model
first = T
for(x in 1:ncol(final_model)){
stepmod <- step(nullmod, scope=list(lower=formula(nullmod), upper=formula(fullmod)),
direction="forward", data=final_model, steps=x, trace=F)
cv.err <- cv.glm(data=final_model, glmfit=stepmod, K=nrow(final_model))$delta[1]
if (first == T){
first=F
final_features <- stepmod
}else{
if (cv.err < min(z)){ final_features <- stepmod }
}
z[x] <- cv.err
print(paste(x,cv.err))
print(colnames(final_features$model))
}
plot(z, main='Forward Feature Selection GLM Final Model',
xlab='Number of Steps', ylab='LOOCV-error', col='red', type='l')
points(z)
colnames(final_features$model)
summary(final_features)
structure(list(response = c(0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 1L, 1L, 1L), HIST1H3F_rna = c(1.09861228866811, 2.07944154167984,
1.09861228866811, 1.79175946922805, 0, 0, 0, 2.39789527279837,
1.38629436111989, 1.6094379124341, 1.6094379124341, 0.693147180559945,
2.19722457733622, 2.39789527279837, 2.89037175789616), NCF1_rna = c(2.77258872223978,
2.63905732961526, 2.19722457733622, 2.30258509299405, 2.56494935746154,
3.09104245335832, 3.98898404656427, 2.56494935746154, 4.06044301054642,
3.87120101090789, 2.07944154167984, 3.49650756146648, 2.07944154167984,
2.07944154167984, 1.09861228866811), WDR66_rna = c(5.06890420222023,
5.11799381241676, 3.40119738166216, 3.25809653802148, 4.02535169073515,
5.8348107370626, 5.89440283426485, 3.87120101090789, 5.67675380226828,
5.35185813347607, 4.15888308335967, 6.23441072571837, 4.0943445622221,
4.21950770517611, 3.95124371858143), PTH2R_rna = c(0.693147180559945,
0.693147180559945, 1.09861228866811, 0, 6.01126717440416, 6.56526497003536,
5.18178355029209, 0, 4.36944785246702, 2.19722457733622, 1.09861228866811,
3.49650756146648, 0, 0.693147180559945, 1.38629436111989),
HAVCR2_rna = c(4.48863636973214,
3.09104245335832, 2.94443897916644, 3.2188758248682, 3.76120011569356,
3.95124371858143, 2.83321334405622, 2.07944154167984, 4.36944785246702,
3.58351893845611, 1.94591014905531, 4.23410650459726, 1.38629436111989,
1.09861228866811, 1.38629436111989), CD200R1_rna = c(2.484906649788,
0.693147180559945, 1.94591014905531, 0.693147180559945, 2.89037175789616,
2.56494935746154, 1.6094379124341, 1.6094379124341, 1.94591014905531,
2.19722457733622, 0.693147180559945, 4.26267987704132, 1.94591014905531,
0, 0.693147180559945), GDF7 = c(0.2232, -0.7281, 0.0655, -0.7919,
0.175, 0.0891, 0.4396, -0.2774, -0.4079, 0.4069, 0.3057, 0.7371,
-0.4978, -0.5096, -0.0827), HS1BP3 = c(0.2232, -0.7281, 0.0655,
-0.7919, 0.175, 0.0891, 0.4396, -0.2774, -0.4079, 0.4069, 0.3057,
0.7371, -0.4978, -0.5096, -0.0827), NKAIN3 = c(0.4072, 0.3216,
-0.5466, -0.1588, 0.4515, 0.2849, 0.1675, 0.0847, 0.6601, 0.6331,
-0.135, 1.3532, -0.503, -0.1241, 0.2061), UG0898H09 = c(0.4072,
0.3216, -0.5466, -0.1588, 0.4515, 0.2849, 0.1675, 0.0847, 0.6601,
0.6331, -0.135, 1.3532, -0.503, -0.1241, 0.2061), C15orf41 = c(0.122,
-0.7519, -1.1267, -0.7882, -0.1117, -0.5105, -0.3905, -0.6834,
-0.5944, 0.0714, -0.8134, -0.0115, -1.1112, -1.1488, -0.4878),
FAM98B = c(-0.1871, -0.7519, -1.1267, -0.7882, -0.1117, -0.5105,
-0.3905, -0.6834, -0.5944, 0.0714, -0.8134, -0.0115, -1.1112,
-1.1488, -0.4878), SPRED1 = c(-0.1871, -0.7519, -1.1267,
-0.7882, -0.1117, -0.5105, -0.3905, -0.6834, -0.5944, 0.0714,
-0.8134, -0.0115, -1.1112, -1.1488, -0.4878), MPDZ_ex = c(1,
0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0), TPR_ex = c(0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), BUB1B_ex = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), APC_ex = c(0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), ATM_ex = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0), DYNC1LI1_ex = c(0,
0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), TTK_ex = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), PSMG2_ex = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), NegRegMitosis = c(1,
0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0), brca1ness = c(0.037719,
0.900878, 0.013261, 0.900878, 0.659963, 0.005629, 9.8e-05,
0.996336, 0.910072, 0.850776, 0.000613, 0.104428, 0.978114,
0.938767, 0.041696), Methylation = c(0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L), LinoleicAcid_Metab = structure(c(2L,
2L, 2L, 2L, 1L, 3L, 2L, 2L, 1L, 5L, 2L, 5L, 1L, 2L, 2L), .Label = c("CYP2E1_high",
"CYP2E1_med", "high", "low", "PLA2G2A_high"), class = "factor"),
Neuro_lr = structure(c(2L, 2L, 1L, 1L, 3L, 3L, 3L, 1L, 3L,
1L, 1L, 3L, 3L, 1L, 1L), .Label = c("1", "2", "3", "4"), class = "factor"),
NOX_signalling = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L), .Label = c("high", "low"
), class = "factor")), .Names = c("response", "HIST1H3F_rna",
"NCF1_rna", "WDR66_rna", "PTH2R_rna", "HAVCR2_rna", "CD200R1_rna",
"GDF7", "HS1BP3", "NKAIN3", "UG0898H09", "C15orf41", "FAM98B",
"SPRED1", "MPDZ_ex", "TPR_ex", "BUB1B_ex", "APC_ex", "ATM_ex",
"DYNC1LI1_ex", "TTK_ex", "PSMG2_ex", "NegRegMitosis", "brca1ness",
"Methylation", "LinoleicAcid_Metab", "Neuro_lr", "NOX_signalling"
), row.names = c(NA, -15L), class = "data.frame")
Summary now gives the following:
Call:
glm(formula = response ~ NegRegMitosis, family = "binomial",
data = final_model)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.971e-06 -3.971e-06 3.971e-06 3.971e-06 3.971e-06
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 25.57 76367.61 0 1
NegRegMitosis -51.13 111790.71 0 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2.0728e+01 on 14 degrees of freedom
Residual deviance: 2.3655e-10 on 13 degrees of freedom
AIC: 4
Number of Fisher Scoring iterations: 24
Again even in a single predictor model, my p-value is 1. The predictor in this case is equal to the response, so it should predict perfectly. Then why is my pvalue 1?