Running the anova() function on a linear regression object returns p-values for each parameter in my model. However, I do not understand what specific test it is running (I think it is a Wald Chi-Square) or how it chose it.
The documentation for the anova function is very sparse and does not explain this detail- is the function computing the F ratio of the model with the parameter and without or is it doing something different? How does the Wald Chi-Square test differ from ANOVA?
Edit:
I am running regression (specifically geepack) on a dataset. The output returns the parameters (betas), including each level of the categorical variables, along with p-values.
My first question is what specific test is being run on each of these parameters? Is it testing whether beta = 0 or is it testing if the parameter is contributing to the model (or is this the same thing)?
Next, I eliminate covariates that are p<0.1 by running anova() on the model to determine which variables are significant. Instead of looking at the categorical variables by level as before, it tests the variable overall and determines a p-value.
Again, I am not sure what exact test it is using to get a p-value and what the test is asking (beta = 0 or contributes to model?)
Edit #2:
library(tidyverse)
library(broom)
library(geepack)
data <- structure(list(Cluster = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 6L, 6L, 6L), Visit = c(0, 18,
24, 0, 6, 18, 24, 0, 6, 12, 24, 18, 0, 6, 18, 24, 6, 0, 6, 18
), Gender = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("Male", "Female"
), class = "factor"), Visit_age = c(37, 38, 39, 22, 23, 24, 24,
20, 21, 21, 22, 22, 36, 37, 38, 38, 22, 42, 42, 43), AHQ_YearsSimilarFreq = c(13,
13, 13, 5, 5, 5, 5, 12, 12, 12, 12, 12, 6, 6, 6, 6, 11, 4, 4,
4), AHQ_Out_Games_MainPos = structure(c(3L, 3L, 3L, 4L, 4L, 4L,
4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L), .Label = c("Forward",
"Midfield", "Defense", "Goaltender"), class = "factor"), AlcWeek_Category = structure(c(2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L), .Label = c("I do not drink", "Light Drinking (M:1-7, F:1-2)",
"Mod-Heavy Drinking (M:8+, F:3+)"), class = "factor"), CigsYN = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), Concussion = structure(c(3L,
3L, 3L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L), .Label = c("0", "1", "2+"), class = "factor"), SchoolYears = c(19,
19, 19, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 20, 20, 20, 15,
20, 20, 20), MHQ_Heading_Male_Quartile = structure(c(NA, NA,
NA, 4L, 4L, 3L, 2L, 2L, 3L, 2L, 3L, 2L, 1L, 3L, 2L, 3L, NA, NA,
NA, NA), .Label = c("0-6", "6.01-15", "15.01-53", "53.01+"), class = "factor"),
AHQ_Heading_Male_Quartile = structure(c(NA, NA, NA, 4L, 4L,
4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, NA, NA, NA, NA
), .Label = c("0-435", "435.01-861", "861.01-1964", "1964.01+"
), class = "factor"), MHQ_Heading_Female_Quartile = structure(c(1L,
1L, 2L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
4L, 2L, 3L, 3L), .Label = c("0-2", "2.1-7", "7.1-24", "24.1+"
), class = "factor"), AHQ_Heading_Female_Quartile = structure(c(1L,
1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
3L, 2L, 2L, 2L), .Label = c("0-108", "108.1-430", "430.1-1278",
"1278.1+"), class = "factor"), MHQ_Unintentional_Impacts_Category = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 1L, 2L, 1L, 1L, 1L, 1L,
3L, 1L, 3L, 2L), .Label = c("0", "1", "2+"), class = "factor"),
AHQ_Unintentional_Impacts_Category = structure(c(3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 3L,
3L, 3L, 3L), .Label = c("0", "1", "2+"), class = "factor"),
ISL_cor = c(27, 33, 30, 29, 28, 28, 29, 33, 28, 32, 27, 25,
26, 23, 22, 24, 26, 31, 31, 30), GMCT_mps = c(0.066638, 0.166589,
1.06656, 1.83315, 1.39986, 1.366439, 1.732987, 1.89962, 1.599627,
1.76649, 1.79964, 1.49935, 0.69993, 0.499967, 1.133031, 1.033575,
1.866542, 1.932174, 1.866418, 1.666167), IDN_lmn = c(2.712426,
2.666696, 2.691574, 2.60214, 2.66739, 2.666007, 2.683421,
2.64958, 2.637634, 2.592968, 2.627395, 2.626299, 2.794555,
2.799669, 2.883404, 2.76647, 2.668145, 2.691837, 2.711009,
2.665336), ONB_acc = c(NA, 1.570796, 1.570796, NA, 1.2692,
1.188903, 1.322056, NA, 1.393086, 1.393086, 1.322056, 1.393086,
NA, 1.322056, 1.225939, 1.2692, 1.393086, NA, 1.2692, 1.322056
), TWOB_acc = c(1.325818, 1.570796, 1.570796, 1.230959, 1.162158,
1.570796, 1.325818, 1.230959, 1.395827, 1.325818, 1.273674,
1.395827, 1.021329, 1.230959, 1.273674, 1.325818, 1.230959,
1.273674, 1.395827, 1.570796), ISRL_cor = c(9, 10, 12, 11,
11, 11, 10, 12, 12, 12, 11, 11, 10, 10, 9, 8, 11, 11, 12,
12), PA_Total = c(21, 23, 24, 20, 13, 20, 21, 21, 24, 23,
24, 21, 16, 21, 22, 23, 23, 19, 22, 22), Spatial = c(0.451184,
0.166667, 0.083333, 0.5, 1.339256, 1.533743, 1.622678, 0.333333,
0.166667, 0.416667, 0.333333, 0.25, 0.970857, 1.235702, 2.41257,
1.27022, 0.686887, 1.206559, 0.436339, 1.235702), Flanker = c(206.48,
106.09, 68.9, 155.93, 102.03, 129.46, 135.85, 89.29, 134.4,
125.57, 104.29, 105.33, 182.3, 108.25, 92, 109.75, 93.85,
41.9, 75.25, 16.43)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
# Start of GEE Modeling --------------------------------------------------------------------------------------------------
covs <- c("Visit_age", "AHQ_YearsSimilarFreq", "AHQ_Out_Games_MainPos",
"AlcWeek_Category", "CigsYN", "Concussion", "SchoolYears")
# Fit GEE model
model <- function(new_data, outcome, preds, covs) {
form = as.formula(paste(outcome, paste(c(preds, covs), collapse = " + "), sep = " ~ "))
fit <- geeglm(form, family = gaussian(link = "identity"), data = new_data,
id = Cluster, waves = Visit, corstr = "independence", std.err = "san.se")
return(fit)
}
run <- function(gender, outcome, preds) {
# New dataframe to model
new_data <- data %>%
arrange(Cluster, Visit) %>%
filter(Gender == gender) %>%
select(Cluster, Visit, outcome, preds, covs) %>%
na.omit() # geeglm only works on complete data
# First run with all covariates
fit <- model(new_data, outcome, preds, covs)
# Retain covariates if p < 0.1
covs <- tidy(anova(fit)) %>%
filter(term %in% covs, p.value < 0.1) %>%
pull(term)
#Second run with significant covariates only
fit <- model(new_data, outcome, preds, covs)
output <- tidy(fit) %>%
filter(str_detect(term, paste(paste0("^", preds),
collapse = "|")))
return(list(output, covs))
}
run("Male", "ISL_cor", "MHQ_Heading_Male_Quartile")