##CHAPTER 14: RECOVER LOST CLASSIFICATION
####################################################
#LIBRARY
####################################################
library(xlsx)
#LOGISTIC REGRESSION
library(car); library(mlogit)
#TEXT MINING
library(tm); library(SnowballC); library(RColorBrewer)
library(wordcloud); library(e1071); library(gmodels)

####################################################
####################################################
##14.2. LOGISTIC REGRESSION
####################################################
#14.2.2. CASE, DATASET AND PREPARATION
####################################################
#PACKAGES: library(car); library(mlogit)

#LOAD DATA
#PATH IS PROVIDED BY THE USER TO POINT TO THE DATA LOCATION
OrigLogisMode<- read.xlsx(
  "C:\\SkillsForCareerSecurity_Datasets.xlsx",
  sheetName="tblLogisClass", header=TRUE)
LogisMode<- OrigLogisMode
head(LogisMode); str(LogisMode)

#SET BASELINE CATEGORY, OVERRIDING THE DEFAULT ALPHA ORDER
LogisMode$IdFailMode<- relevel(LogisMode$IdFailMode, "OthrMode")

####################################################
#14.2.3. BUILD AND INTERPRET MODEL
####################################################
#CREATE HIERARCHICAL MODELS
LogitModel.1 <- glm(IdFailMode ~ MntcStrat,
  data = LogisMode, family = binomial())
LogitModel.2 <- glm(IdFailMode ~ MntcStrat + TmBtwnFlr,
  data = LogisMode, family = binomial())

#RETURN MODELS
summary(LogitModel.1)
summary(LogitModel.2)

#COMPARE NULL MODEL TO MODEL1
modelChi <- LogitModel.1$null.deviance - LogitModel.1$deviance
chidf <- LogitModel.1$df.null - LogitModel.1$df.residual
chisq.prob <- 1 - pchisq(modelChi, chidf)

#REPORT DIFFERENCE IN CHI-SQUARE, DEGREES OF FREEDOM AND P-VALUE
modelChi; chidf; chisq.prob

#COMPUTE ODDS RATIOS AND CONFIDENCE LIMITS
exp(LogitModel.1$coefficients)
exp(confint(LogitModel.1))

#COMPARE MODEL1 AND MODEL2
modelChi <- LogitModel.1$deviance - LogitModel.2$deviance
chidf <- LogitModel.1$df.residual - LogitModel.2$df.residual
chisq.prob <- 1 - pchisq(modelChi, chidf)
modelChi; chidf; chisq.prob

####################################################
#14.2.4. CLASSIFICATION AS PROBABILITY
####################################################
#ATTACH PROBABILITIES OF THE MODEL TO THE DATASET
#PRESENT LOGITMODEL.2 AS GRANULAR
LogisMode$predicted.probabilities<- fitted(LogitModel.2)
head(LogisMode[, c("IdFailMode", "MntcStrat", "TmBtwnFlr",
  "predicted.probabilities")])
tail(LogisMode[, c("IdFailMode", "MntcStrat", "TmBtwnFlr",
  "predicted.probabilities")])

####################################################
#14.2.5. ANALYSIS OF RESIDUALS
####################################################
#ADD DIAGNOSTICS TO DATASET
LogisMode$standardized.residuals<- rstandard(LogitModel.2)
LogisMode$studentized.residuals<- rstudent(LogitModel.2)
LogisMode$dfbeta<- dfbeta(LogitModel.2)
LogisMode$dffit<- dffits(LogitModel.2)
LogisMode$leverage<- hatvalues(LogitModel.2)
LogisMode[, c("leverage", "studentized.residuals", "dfbeta")]
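
#OPTIONAL: FLAG POTENTIALLY INFLUENTIAL CASES
#A MINIMAL SKETCH, NOT PART OF THE ORIGINAL LISTING: CASES WITH AN
#ABSOLUTE STUDENTIZED RESIDUAL ABOVE 2, OR LEVERAGE WELL ABOVE THE
#AVERAGE (k + 1)/n, ARE THE ONES WORTH A CLOSER LOOK
avgLeverage<- length(LogitModel.2$coefficients)/nrow(LogisMode)
LogisMode[abs(LogisMode$studentized.residuals) > 2 |
  LogisMode$leverage > 2*avgLeverage,
  c("IdFailMode", "MntcStrat", "TmBtwnFlr",
    "studentized.residuals", "leverage")]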
####################################################
#14.2.6. RECOVERING LOST CLASSIFICATIONS
####################################################
#TABLES FOR PREDICTION
#WORK ORDERS QUOTED SO LEADING ZEROS SURVIVE
WorkOrdr<- as.factor(c("02002", "02039"))
MntcStrat<- c("OnInterval", "OnFailure")
tblSampleSet<- data.frame(WorkOrdr, MntcStrat)
ModeProb<- data.frame(MntcStrat)

#GENERATE PREDICTION AND LIMITS
ClassProbCl<- cbind(ModeProb, predict(LogitModel.1,
  newdata = ModeProb, type = "link", se = TRUE))
cl<- 1.96
ClassProbCl <- within(ClassProbCl, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (cl * se.fit))
  UL <- plogis(fit + (cl * se.fit))
})
head(ClassProbCl); str(ClassProbCl)

#TABLE OF CLASSIFIED CASES
tblClassified<- data.frame(WorkOrder=tblSampleSet$WorkOrdr,
  ClassProbCl[5:7])
tblClassified

####################################################
#14.2.7. TEST FOR MULTICOLLINEARITY AND LINEARITY
####################################################
#TEST FOR LINEARITY AND COLLINEARITY
#NEW MODEL WITH NUMERIC VARIABLES

#LOAD DATA
#PATH IS PROVIDED BY THE USER TO POINT TO THE DATA LOCATION
OrigLinColin<- read.xlsx(
  "C:\\SkillsForCareerSecurity_Datasets.xlsx",
  sheetName="tblLinColin", header=TRUE)
LinColin<- OrigLinColin
head(LinColin); str(LinColin)

#CREATE THE MODEL
colinearityModel <- glm(Scored ~ Previous + PSWQ + Anxious,
  data = LinColin, family = binomial())
summary(colinearityModel)

#TEST FOR COLLINEARITY: A VIF OVER 10 REVEALS COLLINEARITY
vif(colinearityModel)
#VIEW CORRELATIONS: VERY HIGH VALUES ARE INDICATIVE
cor(LinColin[, c("Previous", "PSWQ", "Anxious")])

#TEST FOR LINEARITY
#CREATE THE INTERACTION OF EACH VARIABLE WITH ITS LOG
LinColin$logPSWQInt<- log(LinColin$PSWQ)*LinColin$PSWQ
LinColin$logAnxInt<- log(LinColin$Anxious)*LinColin$Anxious
LinColin$logPrevInt<- log(LinColin$Previous + 1)*LinColin$Previous

#VIEW VARIABLES ATTACHED TO TABLE
head(LinColin)

#RUN MODEL DESIGNED TO TEST FOR LINEARITY
linearityTest <- glm(Scored ~ PSWQ + Anxious + Previous +
  logPSWQInt + logAnxInt + logPrevInt,
  data=LinColin, family=binomial())
summary(linearityTest)

####################################################
#14.2.8. LOGISTIC REGRESSION AS INSIGHT
####################################################
#CREATE DEMO DATASET
tblPenalty<- OrigLinColin
head(tblPenalty)

#CREATE MODEL FOR AI
modelPenalty<- glm(Scored ~ Previous + PSWQ + Anxious,
  data = tblPenalty, family = binomial())

#CREATE DATAFRAME AS THE "NEWDATA" SET
probSample<- data.frame(
  Previous=tblPenalty$Previous,
  PSWQ=tblPenalty$PSWQ,
  Anxious=tblPenalty$Anxious)
head(probSample)

#GENERATE TABLE OF DATASET AND FIT VARIABLES
ClassProbs<- cbind(tblPenalty, predict(modelPenalty,
  newdata = probSample, type = "link", se = TRUE))

#GENERATE PREDICTION AND LIMITS
cl<- 1.96
ClassProbs <- within(ClassProbs, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (cl * se.fit))
  UL <- plogis(fit + (cl * se.fit))
})
head(ClassProbs); str(ClassProbs)

#TABLE OF DATASET EXTENDED FOR PROB & LIMITS
ClassDataSet<- ClassProbs[, c(1:4, 8:10)]
head(ClassDataSet)
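
#OPTIONAL: PROBABILITY TO HARD CLASSIFICATION
#A MINIMAL SKETCH, NOT PART OF THE ORIGINAL LISTING: A 0.5 CUTOFF ON
#PREDICTEDPROB RECOVERS A HARD CLASS LABEL, AND CROSS-TABULATING IT
#AGAINST THE RECORDED OUTCOME GIVES A QUICK IN-SAMPLE CHECK
table(Actual=ClassDataSet$Scored,
  PredictedOverHalf=ClassDataSet$PredictedProb > 0.5)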
####################################################
####################################################
##14.3. CLASSIFICATION ON TEXT VARIABLES
####################################################
#PACKAGES
#library(tm); library(SnowballC); library(RColorBrewer)
#library(wordcloud); library(e1071); library(gmodels)

####################################################
#14.3.2. CASE, DATASET AND EXPLORATION
####################################################
#LOAD DATA
#PATH IS PROVIDED BY THE USER TO POINT TO THE DATA LOCATION
OrgMsgRaw<- read.xlsx(
  "C:\\SkillsForCareerSecurity_Datasets.xlsx",
  sheetName="tblTextClassifer", header=TRUE,
  stringsAsFactors=FALSE)
MsgRaw<- OrgMsgRaw
MsgRawCloud<- OrgMsgRaw
MsgRaw$Type<- factor(MsgRaw$Type)
str(MsgRaw); head(MsgRaw)

#REPLACE ONE OR MORE SEQUENTIAL PUNCTUATION MARKS WITH A SPACE
MsgRaw$TextGsub<- gsub('[[:punct:]]+', ' ', MsgRaw$Text)
MsgRaw$Text<- MsgRaw$TextGsub
head(MsgRaw$Text)

####################################################
#14.3.3. CLEANSING AND STANDARDIZING TEXT
####################################################
#CREATE CORPUS
msg_corpus<- VCorpus(VectorSource(MsgRaw$Text))
print(msg_corpus)

#INSPECT MESSAGES IN THE CORPUS
inspect(msg_corpus[1:2])
#RETURN THE TEXT OF THE RECORD IN THE BRACKETS
as.character(msg_corpus[[1]])
#RETURN A SERIES OF DOCUMENTS
lapply(msg_corpus[1:2], as.character)

#TRANSFORM ALL TEXT TO LOWER CASE
msg_corpus_clean<- tm_map(msg_corpus, content_transformer(tolower))
#REMOVE NUMBERS
msg_corpus_clean<- tm_map(msg_corpus_clean, removeNumbers)
#REMOVE STOPWORDS
msg_corpus_clean<- tm_map(msg_corpus_clean, removeWords, stopwords())
#REMOVE PUNCTUATION
msg_corpus_clean<- tm_map(msg_corpus_clean, removePunctuation)
#STEMMING
msg_corpus_clean<- tm_map(msg_corpus_clean, stemDocument)
#STRIP WHITESPACE
msg_corpus_clean<- tm_map(msg_corpus_clean, stripWhitespace)

#VIEW RESULTS OF CLEANSING
lapply(msg_corpus_clean[1:2], as.character)

####################################################
#14.3.4. VISUALIZE TEXT AS WORD CLOUDS
####################################################
#CREATE WORD CLOUD OF THE FULL CORPUS
wordcloud(msg_corpus_clean, min.freq=50, random.order=FALSE)

#SUBSET RAW DATASET FOR CLOUDS BY TYPE
spam<- subset(MsgRawCloud, Type=="spam")
ham<- subset(MsgRawCloud, Type=="ham")
str(spam); str(ham)

#PLOT WORD CLOUDS BY TYPE
par(mfrow=c(1,2))
wordcloud(spam$Text, max.words=40, random.order=FALSE, scale=c(3,0.5))
wordcloud(ham$Text, max.words=40, random.order=FALSE, scale=c(3,0.5))
par(mfrow=c(1,1))

####################################################
#14.3.5. CREATE THE DOCUMENT TERM MATRIX
####################################################
#CREATE DOCUMENT TERM MATRIX
msg_dtm<- DocumentTermMatrix(msg_corpus_clean)
msg_dtm

####################################################
#14.3.6. CREATE TRAINING AND TEST DATASETS
####################################################
#SPLIT DTM INTO TRAIN AND TEST DATASETS
msg_dtm_train<- msg_dtm[1:4169, ]
msg_dtm_test<- msg_dtm[4170:5559, ]

#SAVE VECTORS OF LABELS
msg_train_labels<- MsgRaw[1:4169, ]$Type
msg_test_labels<- MsgRaw[4170:5559, ]$Type

#CONFIRM SUBSETS ARE REPRESENTATIVE
prop.table(table(msg_train_labels))
prop.table(table(msg_test_labels))

####################################################
#14.3.7. CREATE INDICATORS OF FREQUENT WORDS
####################################################
#FIND WORDS APPEARING AT LEAST FIVE TIMES IN THE TRAINING DTM
msg_freq_words<- findFreqTerms(msg_dtm_train, 5)
str(msg_freq_words)

#REDUCE DTMS TO ONLY MSG_FREQ_WORDS
msg_dtm_freq_train<- msg_dtm_train[, msg_freq_words]
msg_dtm_freq_test<- msg_dtm_test[, msg_freq_words]

#FUNCTION TO CONVERT DTM COUNTS TO CATEGORICAL YES/NO
convert_counts<- function(x) {
  x<- ifelse(x>0, "Yes", "No")
}

#APPLY YES/NO CONVERSION TO THE DTMS, COLUMN BY COLUMN
msg_train<- apply(msg_dtm_freq_train, MARGIN=2, convert_counts)
msg_test<- apply(msg_dtm_freq_test, MARGIN=2, convert_counts)
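
#OPTIONAL: SANITY-CHECK THE PREPARED MATRICES
#A MINIMAL SKETCH, NOT PART OF THE ORIGINAL LISTING: CONFIRM THAT THE
#ROWS LINE UP WITH THE LABEL VECTORS AND PEEK AT THE YES/NO CODING
dim(msg_train); length(msg_train_labels)
dim(msg_test); length(msg_test_labels)
msg_train[1:3, 1:5]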
####################################################
#14.3.8. TRAIN AND TEST MODEL
####################################################
#TRAIN A MODEL ON THE DATA
msg_classifier<- naiveBayes(msg_train, msg_train_labels)

#EVALUATE MODEL PERFORMANCE
#NOTE: TAKES SOME TIME, BE PATIENT
msg_test_pred<- predict(msg_classifier, msg_test)

#TABLE TO EVALUATE TRAINED MODEL
CrossTable(msg_test_pred, msg_test_labels,
  prop.chisq=FALSE, prop.t=FALSE, prop.r=FALSE,
  dnn=c("predicted", "actual"))

#IMPROVE MODEL PERFORMANCE WITH LAPLACE SMOOTHING
msg_classifier2<- naiveBayes(msg_train, msg_train_labels, laplace=1)
#NOTE: TAKES SOME TIME, BE PATIENT
msg_test_pred2<- predict(msg_classifier2, msg_test)

#TABLE TO COMPARE FIRST MODEL TO IMPROVED MODEL
CrossTable(msg_test_pred2, msg_test_labels,
  prop.chisq=FALSE, prop.t=FALSE, prop.r=FALSE,
  dnn=c("predicted", "actual"))

####################################################
#14.3.9. CLASSIFY CASES
####################################################
#DEMO DATASET FROM PREPARED DTM
UnClassSet<- msg_test
head(UnClassSet)

#MSG_CLASSIFIER2 PREDICTS CLASSIFICATIONS
msg_Unclass_pred<- predict(msg_classifier2, UnClassSet)
predDf<- as.data.frame(msg_Unclass_pred)
head(predDf)

#PLACE RAW MESSAGES IN A DATA FRAME
message<- MsgRaw[4170:5559, 2]
messDf<- as.data.frame(message)
head(messDf)

#RETURN ESTIMATED CLASSIFICATION FOR EACH MESSAGE
Classifications<- data.frame(predDf, messDf)
head(Classifications, 20)
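
#OPTIONAL: OVERALL ACCURACY
#A MINIMAL SKETCH, NOT PART OF THE ORIGINAL LISTING: THE "UNCLASSIFIED"
#DEMO SET IS REALLY THE LABELED TEST SET, SO AGREEMENT WITH THE SAVED
#LABELS GIVES A QUICK ACCURACY FIGURE FOR MSG_CLASSIFIER2
mean(msg_Unclass_pred == msg_test_labels)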