When feeding a categorical variable with n levels into glmnet, do I code n or n-1 dummy variables?
For instance if using days of the week as an independent variable would I use 6 dummies or 7?
If the answer is 6, how do I interpret the coefficients (and any other output) for the dropped category?
EDIT: Here's some example code:
library(glmnet)
library(caret)
# ---- Encoding 1: caret::dummyVars (one indicator column per weekday) ----
# Fix the seed so the simulated data and the CV fold assignment are
# reproducible and identical wherever the same seed is used.
set.seed(42)
# stringsAsFactors = TRUE: since R 4.0 data.frame() keeps strings as character;
# the weekday must be a factor for it to be expanded into dummy columns.
df1 <- data.frame(
  id = 1:210,
  var1 = rep(c('Mon','Tues','Wed','Thurs','Fri','Sat','Sun'), 30),
  stringsAsFactors = TRUE
)
df1$targetVar <- runif(210)
df1$mktVol <- round(runif(210) * 1000000, 0)
# Weekends carry zero market volume.
df1$mktVol <- ifelse(df1$var1 %in% c('Sat','Sun'), 0, df1$mktVol)
df1
# Predictor columns ("variables to use").
vtu <- c('mktVol','var1')
dv1 <- dummyVars(~ ., data = df1[, vtu])
# df2 holds only the encoded predictors (mktVol + weekday indicators);
# the response is not part of it and is taken from df1 below.
df2 <- data.frame(predict(dv1, df1))
# cv.glmnet(x, y, ...): predictor matrix first, response second.
# The earlier attempt passed them swapped, used an invalid negative character
# index, and referenced a targetVar column that df2 does not contain; the
# follow-up call then regressed mktVol (a predictor) instead of targetVar.
glmnet1 <- cv.glmnet(data.matrix(df2), df1$targetVar,
                     family = "gaussian", alpha = .95, nfolds = 5,
                     standardize = FALSE, type.measure = "mse")
Coefficients1 <- coef(glmnet1, s = glmnet1$lambda.min)
# coef() returns a one-column sparse matrix; pull the column out as a named
# numeric vector before testing for non-zero entries.
Active.Index <- which(Coefficients1[, 1] != 0)
Active.Coefficients <- Coefficients1[Active.Index, 1]
# Names of the terms (including the intercept) with non-zero coefficients.
rownames(Coefficients1)[Active.Index]
##############################
# ---- Encoding 2: model.matrix with intercept (n-1 treatment contrasts) ----
# Fix the seed so the simulated data and the CV fold assignment are
# reproducible and identical wherever the same seed is used.
set.seed(42)
# stringsAsFactors = TRUE keeps var1 a factor on R >= 4.0 as well.
df1 <- data.frame(
  id = 1:210,
  var1 = rep(c('Mon','Tues','Wed','Thurs','Fri','Sat','Sun'), 30),
  stringsAsFactors = TRUE
)
df1$targetVar <- runif(210)
df1$mktVol <- round(runif(210) * 1000000, 0)
# Weekends carry zero market volume.
df1$mktVol <- ifelse(df1$var1 %in% c('Sat','Sun'), 0, df1$mktVol)
df1
# Predictor columns ("variables to use").
vtu <- c('mktVol','var1')
# With an intercept in the formula, the first factor level is the reference
# category and gets no column: mktVol + 6 weekday indicators.
dv1 <- model.matrix(~ ., data = df1[, vtu])
# glmnet fits its own intercept, so drop the constant "(Intercept)" column.
dv1 <- dv1[, colnames(dv1) != "(Intercept)", drop = FALSE]
# Fit on the model.matrix encoding built above (previously the fit reused the
# dummyVars data frame df2, so this encoding was never actually compared).
glmnet1 <- cv.glmnet(dv1, df1$targetVar,
                     family = "gaussian", alpha = .95, nfolds = 5,
                     standardize = FALSE, type.measure = "mse")
Coefficients2 <- coef(glmnet1, s = glmnet1$lambda.min)
##############################
# ---- Encoding 3: model.matrix without intercept (all n indicators) ----
# Fix the seed so the simulated data and the CV fold assignment are
# reproducible and identical wherever the same seed is used.
set.seed(42)
# stringsAsFactors = TRUE keeps var1 a factor on R >= 4.0 as well.
df1 <- data.frame(
  id = 1:210,
  var1 = rep(c('Mon','Tues','Wed','Thurs','Fri','Sat','Sun'), 30),
  stringsAsFactors = TRUE
)
df1$targetVar <- runif(210)
df1$mktVol <- round(runif(210) * 1000000, 0)
# Weekends carry zero market volume.
df1$mktVol <- ifelse(df1$var1 %in% c('Sat','Sun'), 0, df1$mktVol)
df1
# Predictor columns ("variables to use").
vtu <- c('mktVol','var1')
# "0 +" removes the intercept from the formula, so the factor is expanded
# into one indicator per level: mktVol + 7 weekday indicators.
dv1 <- model.matrix(~ 0 + ., data = df1[, vtu])
# Fit on the model.matrix encoding built above (previously the fit reused the
# dummyVars data frame df2, so this encoding was never actually compared).
glmnet1 <- cv.glmnet(dv1, df1$targetVar,
                     family = "gaussian", alpha = .95, nfolds = 5,
                     standardize = FALSE, type.measure = "mse")
Coefficients3 <- coef(glmnet1, s = glmnet1$lambda.min)
# Auto-print the three coefficient objects one after another so they can be
# compared side by side.
Coefficients1
Coefficients2
Coefficients3