#-----------------------------# # How Rare is Rare? # # IMportance of Validation # # # # Dr. Aric LaBarr # # Institute for # # Advanced Analytics # #-----------------------------# # Load Needed Libraries # library(ISLR) # Library for data from "Introduction to Statistical Learning with Applications in R" # library(dplyr) # Library used for data manipulation # library(pROC) # Library for model assessment of logistic regression # # Set Seed - Getting random sample from the data set for modeling purposes # set.seed(12345) # Load Data - Default data from ISLR package # def <- Default # Assigning ID's to the observations since they don't already have them # def$ID <- 1:10000 # Splitting data in training and validation # train <- def %>% group_by(default) %>% sample_n(250) valid <- def[which(!(def$ID %in% train$ID)),] # Logistic Model - Classification model predicting default based on student and income variables # Logit.Model <- glm(default ~ factor(student) + income, data = train, family=binomial(logit)) summary(Logit.Model) # Evaluating the logistic model based on ROC curves # model.roc <- roc(Logit.Model$y, Logit.Model$fitted.values) AUC.True <- auc(model.roc) AUC.True # Target Shuffling - Developing the shuffle of simulations # sim <- 1000 # Setting the number of simulations # Y.Shuffle <- matrix(0, nrow=length(train$default), ncol=sim) for(j in 1:sim){ Uniform <- runif(length(train$default)) Y.Shuffle[,j] <- as.numeric(train$default[order(Uniform)]) - 1 } Y.Shuffle <- data.frame(Y.Shuffle) colnames(Y.Shuffle) <- paste('Y.',seq(1:sim),sep="") train <- data.frame(train, Y.Shuffle) # Calucating the ROC curve from logistic regressions from simulated shuffles of the data # AUC.Y <- rep(0,sim) for(i in 1:sim){ AUC.Y[i] <- auc(roc(glm(train[,5+i] ~ income + factor(student), data = train, family=binomial(logit))$y, glm(train[,5+i] ~ balance + income, data = train, family=binomial(logit))$fitted.values)) } # Plotting histogram of all ROC curve values as well as marking the true model ROC curve # hist(c(AUC.Y,AUC.True), breaks=25, col = "blue", main='Distribution of AUC Values', xlab='Area Under the Curve') abline(v = AUC.True, col="red", lwd=2) mtext("True Model", at=AUC.True, col="red") # How many ROC curves beat our ROC curve from the real data? # sum(AUC.Y > AUC.True) # Only 45 out of 1000 beat out our model - 4.5% of random models beat our model # # The likelihood our model occurred by random chance is 4.5% # # Go back and change the seed number to see what happens with a new sample! #