There are a couple of libraries that support forward and backward selection, such as caret and MASS, but I find leaps the most useful despite its incompatibility with predict(). With a couple more lines, we can work around that incompatibility and get the same result.
Let’s load the libraries, create the train and test sets, and define a theme for visualization.
```r
##### Load Goodies #####
library(leaps)
library(ggplot2)

##### Create Train and Test #####
set.seed(1)  # added for reproducibility; not in the original split

#Set the number of observations in the training set
train_number <- round(0.80 * nrow(diamonds), digits = 0)

#The integers in 'train_index' represent observations
train_index <- sample(seq_len(nrow(diamonds)), size = train_number)
train_set <- diamonds[train_index, ]
test_set <- diamonds[-train_index, ]

##### Theme Moma #####
theme_moma <- function(base_size = 12, base_family = "Helvetica") {
  theme(
    plot.background = element_rect(fill = "#F7F6ED"),
    legend.key = element_rect(fill = "#F7F6ED"),
    legend.background = element_rect(fill = "#F7F6ED"),
    panel.background = element_rect(fill = "#F7F6ED"),
    panel.border = element_rect(colour = "black", fill = NA, linetype = "dashed"),
    panel.grid.minor = element_line(colour = "#7F7F7F", linetype = "dotted"),
    panel.grid.major = element_line(colour = "#7F7F7F", linetype = "dotted")
  )
}
```
Next, we train the models.
```r
##### Train #####
regfit.fwd <- regsubsets(price ~ ., data = train_set, nvmax = 9, method = "forward")
regfit.bwd <- regsubsets(price ~ ., data = train_set, nvmax = 9, method = "backward")
```
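As a quick sanity check (not in the original post), coef() with an id argument returns the fitted coefficients of the model of a given size, so we can compare what forward and backward selection pick:

```r
# Compare the 5-variable model chosen by each method.
# id is the model size (number of predictors).
coef(regfit.fwd, id = 5)
coef(regfit.bwd, id = 5)
```

If the two methods agree at a given size, the coefficient names will match; they often diverge for intermediate sizes.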
As usual, summary() gives an overview of the models:
```r
summary(regfit.fwd)
```
```
> summary(regfit.fwd)
Subset selection object
Call: regsubsets.formula(price ~ ., data = train_set, nvmax = 9, method = "forward")
23 Variables  (and intercept)
          Forced in Forced out
carat         FALSE      FALSE
cut.L         FALSE      FALSE
cut.Q         FALSE      FALSE
cut.C         FALSE      FALSE
cut^4         FALSE      FALSE
color.L       FALSE      FALSE
color.Q       FALSE      FALSE
color.C       FALSE      FALSE
color^4       FALSE      FALSE
color^5       FALSE      FALSE
color^6       FALSE      FALSE
clarity.L     FALSE      FALSE
clarity.Q     FALSE      FALSE
clarity.C     FALSE      FALSE
clarity^4     FALSE      FALSE
clarity^5     FALSE      FALSE
clarity^6     FALSE      FALSE
clarity^7     FALSE      FALSE
depth         FALSE      FALSE
table         FALSE      FALSE
x             FALSE      FALSE
y             FALSE      FALSE
z             FALSE      FALSE
1 subsets of each size up to 9
Selection Algorithm: forward
         carat cut.L cut.Q cut.C cut^4 color.L color.Q color.C color^4 color^5 color^6
1  ( 1 ) "*"   " "   " "   " "   " "   " "     " "     " "     " "     " "     " "
2  ( 1 ) "*"   " "   " "   " "   " "   " "     " "     " "     " "     " "     " "
3  ( 1 ) "*"   " "   " "   " "   " "   "*"     " "     " "     " "     " "     " "
4  ( 1 ) "*"   " "   " "   " "   " "   "*"     " "     " "     " "     " "     " "
5  ( 1 ) "*"   " "   " "   " "   " "   "*"     "*"     " "     " "     " "     " "
6  ( 1 ) "*"   " "   " "   " "   " "   "*"     "*"     " "     " "     " "     " "
7  ( 1 ) "*"   " "   " "   " "   " "   "*"     "*"     " "     " "     " "     " "
8  ( 1 ) "*"   "*"   " "   " "   " "   "*"     "*"     " "     " "     " "     " "
9  ( 1 ) "*"   "*"   " "   " "   " "   "*"     "*"     " "     " "     " "     " "
         clarity.L clarity.Q clarity.C clarity^4 clarity^5 clarity^6 clarity^7 depth table
1  ( 1 ) " "       " "       " "       " "       " "       " "       " "       " "   " "
2  ( 1 ) "*"       " "       " "       " "       " "       " "       " "       " "   " "
3  ( 1 ) "*"       " "       " "       " "       " "       " "       " "       " "   " "
4  ( 1 ) "*"       "*"       " "       " "       " "       " "       " "       " "   " "
5  ( 1 ) "*"       "*"       " "       " "       " "       " "       " "       " "   " "
6  ( 1 ) "*"       "*"       " "       " "       " "       " "       " "       " "   " "
7  ( 1 ) "*"       "*"       "*"       " "       " "       " "       " "       " "   " "
8  ( 1 ) "*"       "*"       "*"       " "       " "       " "       " "       " "   " "
9  ( 1 ) "*"       "*"       "*"       "*"       " "       " "       " "       " "   " "
         x   y   z
1  ( 1 ) " " " " " "
2  ( 1 ) " " " " " "
3  ( 1 ) " " " " " "
4  ( 1 ) " " " " " "
5  ( 1 ) " " " " " "
6  ( 1 ) "*" " " " "
7  ( 1 ) "*" " " " "
8  ( 1 ) "*" " " " "
9  ( 1 ) "*" " " " "
```
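Besides the selection matrix, the summary object also carries in-sample model-selection statistics. A small sketch (assuming the objects created above) of how they can rank the nine models without touching the test set:

```r
# summary.regsubsets returns vectors of Cp, BIC, and adjusted R^2,
# one entry per model size.
reg.summary <- summary(regfit.fwd)
which.min(reg.summary$bic)    # model size with the lowest BIC
which.max(reg.summary$adjr2)  # model size with the highest adjusted R^2
```

These criteria penalize model size differently, so they need not agree with each other or with the test RMSE we compute below.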
If this were an lm() object, we could simply call predict(). But predict() doesn’t support the regsubsets class, so we need a workaround:
```r
##### Create Model Matrix #####
test.mat <- model.matrix(price ~ ., data = test_set)
```
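The reason this workaround functions at all is worth a look: model.matrix() expands the factors (cut, color, clarity) into the same contrast columns that regsubsets() saw during training, so its column names line up with the coefficient names. A quick inspection (assuming the objects above):

```r
# One row per test observation; one column per coefficient,
# including the intercept and the polynomial contrasts for the factors.
dim(test.mat)
head(colnames(test.mat))
```

Because the names match, test.mat[, names(coef)] can pull out exactly the columns belonging to any submodel.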
Next, we use a for loop to perform matrix multiplication with %*% and then calculate the \(RMSE\) manually.
```r
##### RMSE #####
rmse <- rep(NA, 9)  #set up a container
for (i in 1:9) {
  coef <- coef(regfit.fwd, id = i)          #extract the coefficients of the i-th model
  yhat <- test.mat[, names(coef)] %*% coef  #keep only the matching columns of the test matrix, then matrix-multiply by the coefficients
  rmse[i] <- sqrt(mean((test_set$price - yhat)^2))  #compare predictions against price in the test set to get RMSE
}
```
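If you use this workaround often, the loop body can be wrapped into an S3 predict method. This is a sketch of that pattern; the function predict.regsubsets below is our own helper, not part of leaps:

```r
# Once defined, R's S3 dispatch lets plain predict() work on regsubsets objects.
predict.regsubsets <- function(object, newdata, id, ...) {
  form  <- as.formula(object$call[[2]])  # recover the formula stored in the call
  mat   <- model.matrix(form, newdata)   # build the matching model matrix
  coefi <- coef(object, id = id)         # coefficients of the id-th model
  mat[, names(coefi)] %*% coefi          # predictions as a one-column matrix
}
```

With it in scope, predict(regfit.fwd, test_set, id = 9) reproduces the yhat for the ninth model in one call.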
Then, we visualize.
```r
##### Visualize ######
visualize <- data.frame(rmse, model = 1:9)

ggplot(visualize, aes(x = model, y = rmse)) +
  geom_line() +
  geom_point(color = "red", size = 3) +
  theme_moma() +
  scale_x_continuous(breaks = 1:9) +  #model is numeric, so use a continuous scale with integer breaks
  ylab("RMSE") +
  xlab("Model Number") +
  ggtitle("RMSE")
```
It seems that model 9 has the lowest test \(RMSE\). With the same loop, we can calculate the test \(RMSE\) for Best Subset Selection, backward, or stepwise selection if we wish.
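For instance, since regfit.bwd was already trained above, swapping it into the loop scores backward selection on the same test set (a sketch; an exhaustive search would use method = "exhaustive" in regsubsets()):

```r
# Same workaround, applied to the backward-selection fits.
rmse.bwd <- rep(NA, 9)
for (i in 1:9) {
  coef <- coef(regfit.bwd, id = i)
  yhat <- test.mat[, names(coef)] %*% coef
  rmse.bwd[i] <- sqrt(mean((test_set$price - yhat)^2))
}
```

Comparing rmse and rmse.bwd side by side then shows whether the two search directions land on equally good models at each size.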