##Load the packages we will need
library(glmnet)
library(caret)
library(plyr)
library(randomForest)
library(xgboost)

##Read in the data (cfb is the original data, cfb.predict is the data we want to make predictions for)
cfb <- read.csv('C:/Users/Charles/Documents/SMU/Statistics Club/Machine Learning/cfb_short.csv')
cfb.predict <- read.csv('C:/Users/Charles/Documents/SMU/Statistics Club/Machine Learning/cfb_short_predict.csv')

##Choose how to resample the data to avoid overfitting
##Normally I use repeated 10-fold cross-validation, but in the interest of time we'll use plain 10-fold CV here
##trnCtrl <- trainControl(method="repeatedcv", number=10, repeats=5)
trnCtrl <- trainControl(method="cv", number=10)

##################
##Random Forests##
##################

##Tune the random forest model (should take 5 minutes or so)
my.train.rf <- train(x=as.matrix(cfb[,-41]), y=cfb[,41], method="rf", trControl=trnCtrl)

##Identify the optimal tuning parameter
#> my.train.rf$bestTune
#  mtry
#2   21

##Fit the final model with the tuned mtry and make predictions on the holdout data
rf.model <- randomForest(x=as.matrix(cfb[,-41]), y=cfb[,41], mtry=21,
                         xtest=as.matrix(cfb.predict[,-41]), ytest=cfb.predict[,41],
                         ntree=1000, keep.forest=T)

##Check how well the predictions fit
rf.pred <- rf.model$test$predicted
plot(rf.pred, cfb.predict[,41], xlab='Random Forest Prediction', ylab='Actual Outcome')
RMSE(rf.pred, cfb.predict[,41])
#17.36873

################################
##Stochastic Gradient Boosting##
################################

##Fit the xgboost model - notice how similar it looks to the code from before
##I used tuneLength=3 to tell caret to try 3 values of each tuning parameter
##This will take 5-10 minutes
my.train.xgb <- train(x=as.matrix(cfb[,-41]), y=cfb[,41], method="xgbLinear",
                      trControl=trnCtrl, tuneLength=3)
my.train.xgb$bestTune
#  nrounds lambda alpha eta
#       50   0.01   0.1 0.3

##Final XGBoost predictions
xgb.pred <- predict(my.train.xgb$finalModel, as.matrix(cfb.predict[,-41]))
RMSE(xgb.pred, cfb.predict[,41])
# xgb.rmse
# 19.12893

##Plot variable importance
importance_matrix <- xgb.importance(colnames(cfb[,-41]), model = my.train.xgb$finalModel)
xgb.plot.importance(importance_matrix)
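
####################
##Model Comparison##
####################

##A quick sketch for putting the two models side by side, assuming rf.pred,
##xgb.pred, rf.model, and cfb.predict created above are still in the workspace

##Mirror the earlier random forest plot for the boosted model
plot(xgb.pred, cfb.predict[,41], xlab='XGBoost Prediction', ylab='Actual Outcome')

##Collect both test-set RMSEs in one small data frame for easy reading
data.frame(model = c('Random Forest', 'XGBoost'),
           rmse = c(RMSE(rf.pred, cfb.predict[,41]),
                    RMSE(xgb.pred, cfb.predict[,41])))

##varImpPlot() from randomForest gives the analogous importance plot for rf.model
##(with the default importance=FALSE it shows node-purity importance only)
varImpPlot(rf.model)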