#CHAPTER 5: LAYERED CHARTING TO KNOW THY DATA #################################################### #################################################### #LIBRARIES TO LOAD #################################################### library(xlsx); library(mice); library(ggplot2) library(qqplotr); library(ggm); library(Hmisc) library(polycor); library(rlm); library(MASS) library(ggpubr); library(psych); library(nlme) #################################################### #################################################### #5.1: INSPECT FOR MISSING DATA #################################################### #5.1.1 FROM SUPER TABLE TO R #################################################### #Load table from Exel and assign to object # provided by the user to inform of location of data sprTbl<- read.xlsx( "C:\\\\SkillsForCareerSecurity_Datasets.xlsx", sheetName="tblSuperTable", header=TRUE) #View data frame, notice as missing data sprTbl #Removes ID column from data frame #Returns data frame with only 2nd through 6th variables sprTbl<- sprTbl[,2:6] #Returned data frame with ID variable omitted head(sprTbl) #################################################### #5.1.2 FIND THE MISSING DATA #################################################### #Generate table and plots of counts and composition of NA #NOTE: If plot already run, close the Graphic panel before rerun md.pattern(sprTbl) #OPTIONAL #Output device if wish to store the plot #Device png() outputs plot as a *.png file png("C:\\\\varablePlot.png") md.pattern(sprTbl) dev.off() #Alterative is to use sniping tool #Generate a data frame of records with missing data records<- sprTbl[!complete.cases(sprTbl),] records #Write the data frame records to Excel file as ListOrdersNA sheet write.xlsx(records, file="C:\\\\", sheetName="ListOrdersNA", row.names=FALSE, append=TRUE ) #################################################### #################################################### #5.2: VISUAL AND STATISTICAL INSPECTION #################################################### ##5.2.1: LOAD AND SURVEY THE DATA #################################################### #Load table from Excel and assign to object mpgKtd<- read.xlsx( "C:\\\\DataBookAssetMgtXlsx.xlsx", sheetName="MpgDataSet", header=TRUE) mpgKtd<- read.xlsx( "C:\\\\SkillsForCareerSecurity_Datasets.xlsx", sheetName="MpgDataSet", header=TRUE) #Figures 6-7 thru 6-11 #Survey views of data head(mpgKtd) str(mpgKtd) summary(mpgKtd) md.pattern(mpgKtd) describe(mpgKtd) #################################################### #5.2.2: TEST FOR NORMAL DISTIRIBUTION #################################################### #Highway compared to normal distribution #Hwy variable qqhwy<- ggplot(data = mpgKtd, mapping = aes(sample = hwy)) + stat_qq_band() + stat_qq_line() + stat_qq_point(aes(color=class)) + labs(x = "Theoretical Quantiles", y = "Sample Quantiles") + ggtitle("Q-Q Test of the Hiqhway Variable") qqhwy #Test hwy variable for normal distribution ##Small p-value indicates is not normal. shapiro.test(mpgKtd$hwy) #Subset the test by facet qqhwyFac<- qqhwy + facet_wrap(~class) + theme(legend.position = "none") qqhwyFac #Test subsets to hwy for normality on class shapiro.test(mpgKtd$hwy[mpgKtd$class=="2seater"]) shapiro.test(mpgKtd$hwy[mpgKtd$class=="compact"]) shapiro.test(mpgKtd$hwy[mpgKtd$class=="midsize"]) shapiro.test(mpgKtd$hwy[mpgKtd$class=="minivan"]) shapiro.test(mpgKtd$hwy[mpgKtd$class=="pickup"]) shapiro.test(mpgKtd$hwy[mpgKtd$class=="subcompact"]) shapiro.test(mpgKtd$hwy[mpgKtd$class=="suv"]) #Create a table of midsize records mpgKtdMid<- mpgKtd[mpgKtd$class=="midsize",] #Q-Q Test of midsize class qqhwyMid<- ggplot(data = mpgKtdMid, mapping = aes(sample = hwy)) + stat_qq_band() + stat_qq_line() + stat_qq_point() + labs(x = "Theoretical Quantiles", y = "Sample Quantiles") + ggtitle("Q-Q Test of the Hiqhway Variable - Midsize") + theme(legend.position = "none") qqhwyMid #Wrap subset midsize on model and fl qqhwyMidFac<- qqhwyMid + facet_wrap(~model + fl) + theme(legend.position = "none") qqhwyMidFac #Grid subset midsize on model and year qqhwyMidGrd<- qqhwyMid + facet_grid(model~year) + stat_qq_point() + theme(legend.position = "none") qqhwyMidGrd ##Cty variable qqcty <- ggplot(data = mpgKtd, mapping = aes(sample = cty)) + stat_qq_band() + stat_qq_line() + stat_qq_point(aes(color=class)) + labs(x = "Theoretical Quantiles", y = "Sample Quantiles") + ggtitle("City Variable") qqcty #Test cty variable for normalicy ##Small p-value indicates is not normal. shapiro.test(mpgKtd$cty) qqctyFac<- qqcty + facet_wrap(~class) + theme(legend.position = "none") qqctyFac ##Displ variable qqdispl <- ggplot(mpgKtd, mapping = aes(sample = displ)) + stat_qq_band() + stat_qq_line() + stat_qq_point(aes(color=class)) + labs(x = "Theoretical Quantiles", y = "Sample Quantiles") + ggtitle("Displacement") qqdispl #Test displ variable for normalicy ##Small p-value indicates is not normal. shapiro.test(mpgKtd$displ) qqdisplFac<- qqdispl + facet_wrap(~class) + theme(legend.position = "none") qqdisplFac ##Cyl variable qqcyl<- ggplot(mpgKtd, mapping = aes(sample = cyl)) + stat_qq_band() + stat_qq_line() + stat_qq_point(aes(color=class)) + labs(x = "Theoretical Quantiles", y = "Sample Quantiles") + ggtitle("Cylinders") qqcyl #Test cyl variable for normalicy ##Small p-value indicates is not normal. shapiro.test(mpgKtd$cyl) qqcylFac<- qqdispl + facet_wrap(~class) + theme(legend.position = "none") qqcylFac #################################################### #5.2.3: INSPECT CORRELATION BETWEEN VARIABLES #################################################### #Visualize correlation ##Pairs of variables pairs.panels(mpgKtd[c("hwy", "cty", "cyl", "displ", "drv", "model", "trans")]) ##Return correlation tables using person and spearman. ##create a table for correlation table corMat<- as.matrix(mpgKtd[,c("hwy","cty", "displ", "cyl")]) cor(corMat, method = "pearson") cor(corMat, method = "spearman") ##P-value and confidence interval to Pearson correlation cor.test(mpgKtd$hwy, mpgKtd$displ, method="pearson") cor.test(mpgKtd$hwy, mpgKtd$cty, method="pearson") cor.test(mpgKtd$hwy, mpgKtd$cyl, method="pearson") cor.test(mpgKtd$cty, mpgKtd$displ, method="pearson") cor.test(mpgKtd$cty, mpgKtd$cyl, method="pearson") ##R squared tables cor(corMat, method = "pearson")^2 cor(corMat, method = "spearman")^2 #Partial correlation analysis pcHwyDispl<- pcor(c("hwy", "displ", "cyl"), var(corMat)) pcHwyDispl pcHwyDispl^2 pcor.test(pcHwyDispl, 1, 234) pcHwyCyl<- pcor(c("hwy", "cyl", "displ" ), var(corMat)) pcHwyCyl pcHwyCyl^2 pcor.test(pcHwyCyl, 1, 234) ##Corrlation of categorical variable to numeric variables ##Single out correlation of point to serial #View correlation between drive categories to hwy mileage #Correlation of front drv and hwy with rear as base mpgfr<- mpgKtd[(mpgKtd$drv=="f" | mpgKtd$drv=="r"),] mpgfr$numdr<- ifelse(mpgfr$drv=="r", 0, 1) cor.test(mpgfr$hwy, mpgfr$numdr) #Four wheel drive to hwy with rear as base mpg4r<- mpgKtd[(mpgKtd$drv=="4" | mpgKtd$drv=="r"),] mpg4r$numd4<- ifelse(mpg4r$drv=="r", 0, 1) cor.test(mpg4r$hwy, mpg4r$numd4) #Left panel #Compare data as scatter and subsets #With subset on points scatSubPt<- ggplot(mpgKtd, aes(x = displ, y = hwy)) + geom_point(aes(color = drv, shape = class)) + geom_smooth() + scale_shape_manual(values=seq(0,6)) + labs(x = "Displacement", y = "Mileage") + ggtitle("Subset on points-class & drv") + theme(legend.position = c(.5, .8), legend.box = "horizontal") scatSubPt #Right panel #With subset on points and facets scatSubPrFc<- ggplot(mpgKtd, aes(displ, hwy)) + geom_point(aes(color = drv, shape = class)) + geom_smooth(aes(displ, hwy)) + scale_shape_manual(values=seq(0,6)) + facet_wrap(~cyl) + labs(x = "Displacement", y = "Mileage") + ggtitle("Subset on wrap - cyl") + theme(legend.position = "none") scatSubPrFc ##Alernate code to the chart scatSubPrFcAlt<- scatSubPrFc + facet_wrap(~cyl) scatSubPrFcAlt #Side-by-side #Plot scat and scatFac scat1x2<- ggarrange(scatSubPt, scatSubPrFc, ncol = 2, nrow = 1) scat1x2 #Multiple base plots scat2Chts<- ggplot(mpgKtd, aes(displ, hwy)) + geom_point(aes(color = "hwy")) + geom_smooth(aes(color = "hwy"), se=TRUE ) + geom_point(aes(displ, cty, color = "cty")) + geom_smooth(aes(displ, cty, color = "cty"), se=TRUE) + labs(x = "Displacement", y = "Mileage") + ggtitle("Mileage vs Displacement") + theme(legend.position = c(.95, .9), legend.title = element_blank()) scat2Chts #Methods for Over laping and plotting #Left panel ##Count in area chart scatAreaCnt<- ggplot(mpgKtd, aes(displ, hwy, color = drv)) + geom_point() + geom_count() + theme(legend.position = c(.95, .8)) scatAreaCnt #Right panel #Hex chart scatHex<- ggplot(mpgKtd, aes(displ, hwy, color = drv)) + geom_hex() + theme(legend.position = c(.95, .8)) scatHex #Side-by-side #Plot count and hex chart overPlot1x2<- ggarrange(scatAreaCnt, scatHex, ncol = 2, nrow = 1) overPlot1x2 #################################################### #5.2.4: INSPECT CENTRALITY AND SPREAD #################################################### #Left panel #Boxplot, mean and points boxPlt<- ggplot(mpgKtd, aes(drv, cty, color = drv)) + geom_point() + geom_boxplot(size = 1) + geom_jitter() + geom_point(stat="summary", fun="mean", color = "black", size = 4, shape = 17) + theme(legend.position = c(.95, .9)) boxPlt #Right panel #Violin, mean and points vioPlt<- ggplot(mpgKtd, aes(drv, cty, color = drv)) + geom_point() + geom_violin(size = 1) + geom_jitter() + geom_point(stat="summary", fun="mean", color = "black", size = 4, shape = 17) + theme(legend.position = c(.95, .9)) vioPlt #Side-by-side #Plot Boxplot and Violin boxVio1x2<- ggarrange(boxPlt, vioPlt, ncol = 2, nrow = 1) boxVio1x2 #Boxplot with continuous variable as group variable boxContin<- ggplot(mpgKtd, aes(displ, cty, color = drv)) + geom_point() + geom_boxplot(aes(group = cut_width(displ, .1))) + theme(legend.position = c(.95, .9)) boxContin #Histogram and polygon ##Histogram and polygon Without subsetting hstPly<- ggplot(mpgKtd, aes(hwy)) + geom_histogram(bins=10, alpha = .4)+ geom_freqpoly(bins=10, size = 1) hstPly #Left panel ##Create subsetting perspective hstSubPt<- ggplot(mpgKtd, aes(hwy, fill=drv)) + geom_histogram(bins=10) + theme(legend.position = c(.95, .9)) hstSubPt #Right panel hstSubWrp<- hstSubPt + facet_wrap(~drv, ncol=1) hstSubWrp #Side-by-side hstSubPtWrp<- ggarrange(hstSubPt, hstSubWrp, ncol = 2, nrow = 1) hstSubPtWrp ##Polygon subset on drive plySub<- ggplot(mpgKtd, aes(hwy, color=drv)) + geom_freqpoly(bins=10, size = 1.25) plySub #Left panel ##View hist and polys on a density basis hstDen<- ggplot(mpgKtd, aes(hwy, fill=drv)) + geom_histogram(aes(y = ..density..), bins = 10, alpha = .4) + theme(legend.position = c(.95, .9)) hstDen #Right panel plyDen<- ggplot(mpgKtd, aes(hwy, fill=drv), bins = 10) + geom_density( alpha = 0.2) + xlim(0, 50) + theme(legend.position = c(.95, .9)) plyDen #Side-by-side denHstPly<- ggarrange(hstDen, plyDen, ncol = 2, nrow = 1) denHstPly #################################################### #5.2.5: INSPECT CATEGORICAL VARIABLES #################################################### #Left panel ##Barchart upon counts barCnt<- ggplot(mpgKtd, aes(manufacturer, fill = drv)) + geom_bar() + theme(axis.text.x = element_text(face = "bold", color = "black", size = 10, angle = 90), axis.text.y = element_text(face = "bold", color = "black", size = 10, angle = 0)) + theme(legend.position = c(.5, .8)) + ggtitle("Count stacked") barCnt #Right panel #Barchart with dodge barCntDod<- ggplot(mpgKtd, aes(manufacturer, fill = drv)) + geom_bar(position = "dodge", width = .7) + theme(axis.text.x = element_text(face = "bold", color = "black", size = 10, angle = 90), axis.text.y = element_text(face = "bold", color = "black", size = 10, angle = 90)) + theme(legend.position = c(.5, .8)) + ggtitle("Count dodged") barCntDod #Side-by-side #Plot 1x2 charts barCntDod1x2<- ggarrange(barCnt, barCntDod, ncol = 2, nrow = 1) barCntDod1x2 #Barchart upon mean barAvg<- ggplot(mpgKtd, aes(manufacturer, hwy, fill = manufacturer)) + geom_bar(stat="summary", fun="mean") + theme(axis.text.x = element_text(face = "bold", color = "black", size = 10, angle = 90), axis.text.y = element_text(face = "bold", color = "black", size = 10, angle = 0), legend.position = "none") + ggtitle("Average of Miles ") barAvg #################################################### #5.2.6: INSPECT VARIABLES OVER TIME #################################################### ##Download data set economics<- read.xlsx( "C:\\\\DataBookAssetMgtXlsx.xlsx", sheetName="EconDataSet", header=TRUE) str(economics) #Left panel #Time series of unemployement serUE<- ggplot(economics, aes(date, unemploy/pop)) + geom_line() + ggtitle("Unemployement") serUE #Right panel #Time series median time unemployed serTimeUE<- ggplot(economics, aes(date, uempmed)) + geom_line() + ggtitle("Median Time Unemployed") serTimeUE #Side-by-side #Plot 2 charts series1x2<- ggarrange(serUE, serTimeUE, ncol = 2, nrow = 1) series1x2 #Left panel #Scatter plot of two charts. crossUeTm<- ggplot(economics, aes(unemploy/pop, uempmed)) + geom_point() + ggtitle("Cross plot of two variables") crossUeTm #Right panel pthPlt<- ggplot(economics, aes(unemploy/pop, uempmed)) + geom_path(color="grey50") + geom_point(aes(color = DateYr)) + theme(legend.position = c(.1, .8)) + ggtitle("Cross plot with path") pthPlt #Side-by-side #Plot 2 charts crosPath1x2<- ggarrange(crossUeTm, pthPlt, ncol = 2, nrow = 1) crosPath1x2 #Load the R data set data(Oxboys) head(Oxboys) #Lines per group with overall linear fit line grpLines<- ggplot(Oxboys, aes(age, height)) + geom_line(aes(group = Subject, color = Subject), size = 1) + geom_smooth(method = "lm", size = 3, se = FALSE, color = "black", linetype = "dashed") grpLines #Plot by group and boz and mean at occasion grpBox<- ggplot(Oxboys, aes(Occasion, height)) + geom_boxplot() + geom_line(aes(group = Subject, color = Subject), size = 1) + geom_point(stat="summary", fun="mean", color = "black", size = 4, shape = 18) grpBox #################################################### #5.3: SAVE AND DISSIMINATE #################################################### #Save to a pdf file #Turn dev on, export to pdf file pdf("C:\\\\Dissim.pdf", paper="USr") print(list(serUE, serTimeUE, pthPlt)) #Turn device off dev.off()