#CHAPTER 7: RELATE OPERATIONAL VARIABLE TO OUTCOME
####################################################
####################################################
#LIBRARIES TO LOAD
####################################################
library(xlsx); library(psych); library(ggplot2)
library(qqplotr); library(ggpubr); library(mice)
library(MASS); library(QuantPsyc); library(nlme)
library(car); library(dplyr)
####################################################
####################################################
## 7.2: PROCESS OF MACHINE LEARNING
####################################################
## 7.2.1: LOAD, INSPECT AND PREPARE DATA
####################################################
#BEGIN DOWNLOAD DATA
#Load table from Excel and assign to insure.1
#Path is supplied by the user to point to the location of the data
insure.1<- read.xlsx(
  "C:\\\\SkillsForCareerSecurity_Datasets.xlsx",
  sheetName="insurance", header=TRUE)
#END DOWNLOAD DATA
####################################################
#BEGIN INSPECT DATA
#Inspect data
head(insure.1)
str(insure.1)
summary(insure.1)
describe(insure.1)
#Check for missing data
md.pattern(insure.1)
#Review the correlation between numeric variables
#View shapes of the variables
pairs.panels(insure.1[c("expenses", "bmi", "age", "children")])
#END INSPECT DATA
####################################################
#BEGIN PREPARE DATA
#Add variables to the model
#Assign insure.1 to insure.2
insure.2<- insure.1
#Add an age quadratic variable
insure.2$age2<- insure.2$age^2
#Add BMI categories
insure.2$bmiCl<- as.factor(ifelse(insure.2$bmi >= 30, "Obese",
  ifelse(insure.2$bmi <= 24.9, "Healthy", "OverWt")))
#Add bmi above or below the obesity threshold
insure.2$bmiObese<- as.factor(ifelse(insure.2$bmi >= 30, "Over", "Under"))
#Check insure.2 for desired outcomes
str(insure.2)
#END PREPARE DATA
####################################################
## 7.2.2: DUMMY VARIABLES
####################################################
#BEGIN DUMMY VARIABLES
#Review contrasts and reference level of the bmiCl variable
str(insure.2$bmiCl)
attributes(insure.2$bmiCl)
contrasts(insure.2$bmiCl)
#Define dummy variables for the bmiCl variable
OverWt_v_Healthy<- c(0,0,1)
Obese_v_Healthy<- c(0,1,0)
contrasts(insure.2$bmiCl)<- cbind(OverWt_v_Healthy, Obese_v_Healthy)
contrasts(insure.2$bmiCl)
#Set reference level of the bmiObese variable
#Before
contrasts(insure.2$bmiObese)
insure.2$bmiObese<- relevel(insure.2$bmiObese, ref = "Under")
#After
contrasts(insure.2$bmiObese)
#END DUMMY VARIABLES
####################################################
## 7.2.3: SELECT THE VARIABLES
####################################################
#BEGIN SELECT VARIABLES
#Test for standardized effect
#Model for effects
model.stdEffect<- lm(expenses ~ age + bmi + children, data = insure.2)
summary(model.stdEffect)
#View comparative effect of 1 standard deviation
lm.beta(model.stdEffect)
#Retain all continuous numeric variables as candidates
#Age with all categorical variables
model.1<- lm(expenses ~ age + bmiCl + sex + smoker + region,
  data = insure.2)
summary(model.1)
confint(model.1)
#Add bmi and remove sex and region
model.2<- lm(expenses ~ age + bmi + bmiCl + smoker, data = insure.2)
summary(model.2)
AIC(model.1, model.2)
#Replace bmiCl with bmiObese
model.3<- lm(expenses ~ age + bmi + bmiObese + smoker, data = insure.2)
summary(model.3)
AIC(model.1, model.2, model.3)
#Add children
model.4<- lm(expenses ~ age + bmi + children + bmiObese + smoker,
  data = insure.2)
summary(model.4)
AIC(model.1, model.2, model.3, model.4)
#END SELECT VARIABLES
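###########################################
#OPTIONAL CROSS-CHECK (editor's sketch, not part of the original workflow):
#stepAIC() from the already-loaded MASS package automates the AIC
#comparisons made by hand above. The starting model below is an
#assumption chosen to include every candidate predictor except bmiObese,
#which duplicates the Obese level of bmiCl.
model.full<- lm(expenses ~ age + bmi + children + bmiCl + sex + smoker + region,
  data = insure.2)
model.step<- stepAIC(model.full, direction = "both", trace = FALSE)
summary(model.step)
#Compare with the manually selected model fit to the same data
AIC(model.4, model.step)
###########################################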
####################################################
## 7.2.4: TRANSFORMING VARIABLES FOR LINEARITY
####################################################
#BEGIN TRANSFORM AGE
#Assign insure.2 to insure.3
insure.3<- insure.2
str(model.4)
#Extract standardized residuals from model.4
#Residuals placed in insure.3
insure.3$stdResid.1<- rstandard(model.4)
residAge<- ggplot(insure.3, aes(x = age, y = stdResid.1)) +
  geom_point() + geom_smooth() +
  facet_grid(smoker~bmiObese)
residAge
model.5<- lm(expenses ~ age + age2 + bmi + children + bmiObese + smoker,
  data = insure.3)
summary(model.5)
insure.3$stdResid.2<- rstandard(model.5)
residAgePol<- ggplot(insure.3, aes(x = age, y = stdResid.2)) +
  geom_point() + geom_smooth() +
  facet_grid(smoker~bmiObese)
residAgePol
#Compare models
AIC(model.4, model.5)
anova(model.4, model.5)
age<- ggarrange(residAge, residAgePol, ncol = 2, nrow = 1)
age
#END AGE TRANSFORMATION
###########################################
#BEGIN: INSERT INTERACTION
#Interaction addresses standardized residuals not centered at 0
#Insert bmiObese and smoker interaction
model.6<- lm(expenses ~ age + age2 + bmi + children + bmiObese*smoker,
  data = insure.3)
summary(model.6)
insure.3$stdResid.3<- rstandard(model.6)
residIntr<- ggplot(insure.3, aes(x = age, y = stdResid.3)) +
  geom_point() + geom_smooth() +
  facet_grid(smoker~bmiObese)
residIntr
AIC(model.4, model.5, model.6)
anova(model.4, model.5, model.6)
#END: INSERT INTERACTION
###########################################
#BEGIN BMI TRANSFORMATION
#Evaluate bmi
residBmi<- ggplot(insure.3, aes(x = bmi, y = stdResid.3)) +
  geom_point() + geom_smooth() +
  facet_grid(smoker~bmiObese)
residBmi
#END BMI TRANSFORMATION
###########################################
#BEGIN CHILDREN TRANSFORMATION
residChlInt<- ggplot(insure.3, aes(x = children, y = stdResid.3)) +
  geom_point() + geom_smooth(method = "loess") +
  facet_grid(smoker~bmiObese)
residChlInt
#END CHILDREN TRANSFORMATION
####################################################
## 7.2.5: TEST THE MODEL FOR GENERALIZATION
###########################################
#BEGIN TEST MODEL FOR GENERALIZATION
#Test for independence of errors
durbinWatsonTest(model.6)
#Test model.6 for multicollinearity
vif(model.6)
#Add the fitted values of model.6 to the table
insure.3$fitted<- model.6$fitted.values
#Standardized residuals stdResid.3 are from model.6
residModel.6<- ggplot(insure.3, aes(fitted, stdResid.3)) +
  geom_point(aes(color = bmiObese, shape = smoker)) +
  geom_smooth(method = "lm", color = "Blue") +
  facet_grid(smoker~bmiObese) +
  theme(legend.position = "none")
residModel.6
#Test for normal distribution
shapiro.test(insure.3$stdResid.3)
#Q-Q test
qqModel.6<- ggplot(data = insure.3, mapping = aes(sample = stdResid.3)) +
  stat_qq_line(color = "Blue", size = 2) +
  stat_qq_point() +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles")
qqModel.6
#Density plots
hstModel.6<- ggplot(insure.3, aes(stdResid.3)) +
  geom_histogram(aes(y = ..density..), bins = 10, alpha = .4) +
  stat_function(fun = dnorm,
    args = list(mean = mean(insure.3$stdResid.3, na.rm = TRUE),
                sd = sd(insure.3$stdResid.3, na.rm = TRUE)),
    size = 2)
hstModel.6
#Figure with Q-Q plot and histogram for model.6
QqHstModel.6<- ggarrange(qqModel.6, hstModel.6, ncol = 2, nrow = 1)
QqHstModel.6
#Density plot to compare smoker distributions
polyDen.Smoke<- ggplot(insure.3, aes(stdResid.3, fill = smoker)) +
  geom_density(alpha = 0.2) +
  theme(legend.position = c(.9, .9))
polyDen.Smoke
#Density plot to compare bmiObese distributions
polyDen.Obese<- ggplot(insure.3, aes(stdResid.3, fill = bmiObese)) +
  geom_density(alpha = 0.2) +
  theme(legend.position = c(.85, .9))
polyDen.Obese
#Figure with density plots for model.6 by smoker and bmiObese
DenModelInter<- ggarrange(polyDen.Smoke, polyDen.Obese, ncol = 2, nrow = 1)
DenModelInter
#END TEST MODEL FOR GENERALIZATION
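###########################################
#OPTIONAL CHECK (editor's sketch, not part of the original workflow):
#the diagnostics above test generalization through the residuals; a
#holdout split is another common check. The 80/20 split and the seed
#below are arbitrary choices made for illustration.
set.seed(123)
trainRows<- sample(seq_len(nrow(insure.3)), size = floor(0.8*nrow(insure.3)))
train.1<- insure.3[trainRows, ]
test.1<- insure.3[-trainRows, ]
model.6.train<- lm(expenses ~ age + age2 + bmi + children + bmiObese*smoker,
  data = train.1)
#Root-mean-square error of prediction on the held-out 20 percent
sqrt(mean((test.1$expenses - predict(model.6.train, test.1))^2))
###########################################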
####################################################
####################################################
## 7.3: EXTREME CASES CLEANSED BY AI
####################################################
## 7.3.1: FLAG CASES THAT CHANGE PARAMETERS
####################################################
#BEGIN REMOVE LEVERAGERS & INFLUENCERS
#Create a variable to flag leverage cases
insure.3$HatLever<- hatvalues(model.6)
##Compute the hat-value (leverage) criterion
k<- 6
g<- 3
h<- g*(k + 1)/length(insure.3$HatLever)
#Form table of leverage cases
hatTbl<- insure.3[insure.3$HatLever > h, ]
length(hatTbl$HatLever)
#Create a variable to flag candidate influencers of expectations
insure.3$CookInflu<- cooks.distance(model.6)
##Apply the Cook's distance test for influencers
cookTbl<- insure.3[insure.3$CookInflu > 1, ]
length(cookTbl$CookInflu)
#View hatTbl and cookTbl points among all points
HatCookPts<- ggplot(insure.3, aes(fitted, stdResid.3)) +
  geom_point(aes(color = bmiObese, shape = smoker)) +
  geom_point(data = hatTbl, aes(fitted, stdResid.3), shape = 10, size = 3) +
  geom_point(data = cookTbl, aes(fitted, stdResid.3), shape = 5, size = 3) +
  geom_smooth(method = "lm", color = "Blue")
HatCookPts
#Create data set with leverage and influence cases removed
clean.1<- insure.3[insure.3$HatLever <= h & insure.3$CookInflu <= 1, ]
str(clean.1)
#Check total numbers
length(insure.3$expenses)
length(clean.1$expenses)
length(hatTbl$HatLever) + length(cookTbl$CookInflu)
#END REMOVE LEVERAGERS & INFLUENCERS
####################################################
## 7.3.2: FLAG OUTLIERS TO THE MAINSTREAM
###########################################
#BEGIN REMOVE OUTLIERS
#Run model with data absent leverage and influence cases
model.clean.1<- lm(expenses ~ age + age2 + bmi + children + bmiObese*smoker,
  data = clean.1)
summary(model.clean.1)
clean.1$stdResid.4<- rstandard(model.clean.1)
str(clean.1)
#Criterion for candidate outliers to expectations
#Use 2.58 to flag roughly the most extreme 1 percent of cases
UpLmt<- 2.58
LwLmt<- -UpLmt
residTbl<- clean.1[clean.1$stdResid.4 < LwLmt | clean.1$stdResid.4 > UpLmt, ]
length(residTbl$stdResid.4)
#View outliers in clean.1 data
outlierPts<- ggplot(clean.1, aes(fitted, stdResid.4)) +
  geom_point(aes(color = bmiObese, shape = smoker)) +
  geom_point(data = residTbl, aes(fitted, stdResid.4), shape = 10, size = 3) +
  geom_smooth(method = "lm", color = "Blue")
outlierPts
#END: REMOVE OUTLIERS
####################################################
## 7.3.3: CREATE DATASETS FOR IMPUTATION
###########################################
#BEGIN: CREATE CLEAN AND DIRTY DATASETS
#Create data set with dirty data only:
#leverage/influence cases flagged in insure.3 plus outliers flagged in clean.1
#(leverage/influence cases were never scored on stdResid.4, so set it to NA)
leverInfluTbl<- insure.3[insure.3$HatLever > h | insure.3$CookInflu > 1, ]
leverInfluTbl$stdResid.4<- NA
dirty.1<- rbind(leverInfluTbl, residTbl)
str(dirty.1)
#Remove duplicates from dirty cases - if any
dirty.2<- distinct(dirty.1)
str(dirty.2)
#Create a clean table for AI
clean.2<- clean.1[clean.1$stdResid.4 >= LwLmt & clean.1$stdResid.4 <= UpLmt, ]
str(clean.2)
#END: CREATE CLEAN AND DIRTY DATASETS
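###########################################
#OPTIONAL INSPECTION (editor's sketch, not part of the original workflow):
#before imputing, compare the expense distributions of the clean and the
#flagged cases; extreme cases typically show up in the tails. Built in a
#separate table so clean.2 and dirty.2 stay untouched.
flagCompare<- rbind(
  data.frame(expenses = clean.2$expenses, set = "clean"),
  data.frame(expenses = dirty.2$expenses, set = "flagged"))
denFlag<- ggplot(flagCompare, aes(expenses, fill = set)) +
  geom_density(alpha = 0.2) +
  theme(legend.position = c(.85, .9))
denFlag
###########################################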
####################################################
## 7.3.4: AI TO IMPUTE CLEANSED DATA
###########################################
#BEGIN: CLEAN MODEL AND ESTIMATE IMPUTES
#Model with clean data
model.clean.2<- lm(expenses ~ age + age2 + bmi + children + bmiObese*smoker,
  data = clean.2)
summary(model.clean.2)
#Predict imputed expenses for the questionable cases
#Drop the expenses column (column 7) before predicting
dirty.3<- dirty.2[,-7]
imputePred<- dirty.3
str(imputePred)
#Returns imputePred with a vector of predictions
imputePred$expenses<- predict(model.clean.2, dirty.3)
head(imputePred)
str(imputePred)
#Create append data set with columns rearranged to match clean.3
append.1<- imputePred[,c(1:6,17,7:9)]
str(append.1)
#Remove extra columns from clean.2
clean.3<- clean.2[,c(1:10)]
#Bind append.1 to clean.3
clean.4<- rbind(clean.3, append.1)
str(clean.4)
model.Imput<- lm(expenses ~ age + age2 + bmi + children + bmiObese*smoker,
  data = clean.4)
summary(model.Imput)
#END: MODEL WITH CLEAN DATA
####################################################
####################################################
## 7.4: INDIVIDUAL PREDICTIONS AND INTERVALS
####################################################
####################################################
#BEGIN: INDIVIDUAL PREDICTIONS
#Inputs to the prediction
age<- c(25, 45)
age2<- age^2
children<- c(2, 2)
bmi<- c(23, 38)
bmiObese<- c("Under", "Over")
smoker<- c("no", "no")
EstExpIndiv<- data.frame(age, age2, children, bmi, bmiObese, smoker)
EstExpIndiv
#Predict and add to data.frame EstExpIndiv
EstExpIndiv$expenses<- predict(model.Imput, EstExpIndiv,
  interval = "prediction", level = .95)
EstExpIndiv
#END: INDIVIDUAL PREDICTIONS
#END: SCRIPT
###########################################
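###########################################
#OPTIONAL FOLLOW-ON (editor's sketch, not part of the original workflow):
#the prediction interval above covers a single new customer's expenses;
#the confidence interval below covers only the mean expense for customers
#with these same hypothetical inputs, so it is narrower.
predict(model.Imput, EstExpIndiv, interval = "confidence", level = .95)
###########################################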