## DEMONSTRATION OF "R" IN ACTION: CORRELATION ANALYSIS ## SOURCE OF EXAMPLE: DISCOVERING STATISTICS USING R, CHAPTER 6: CORRELATION. FIELD and MILES, 2012 ####################################################################################################### #### SET UP THE SESSION PACKAGES ############################################### #################################################################################################### ## R is built of packages but not all are downloaded at the installation of R. ## Instead, packages are installed as neede initially and activivated by the library() function ## If not installed on the computer, install the following packages. ## Hash has been placed so that will not automatically run if the entire script is run. ## To install, highlight the line excluding the hash #install.packages("ggm") #install.packages("polycor") #install.packages("Hmisc") #install.packages("ggplot2") #install.packages("rlm") #install.packages(MASS) ## If booting up the script for current session, execute each of the packages. ## Hash has been placed so that will not automatically run if the entire script is run. ## To install, highlight the line excluding the hash #library(ggm) #library(Hmisc) #library(polycor) #library(ggplot2) #library(rlm) #library(MASS) #### FUNCTION TO CREATE PAIRS PLOT--ONLY RUN INITIALLY ############################################# #################################################################################################### ## Function built for the scaterplot matrix--Only need to run for intial run. ## to form histogram on the diagnal panel.hist <- function(x, ...) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(usr[1:2], 0, 1.5) ) h <- hist(x, plot = FALSE) breaks <- h$breaks; nB <- length(breaks) y <- h$counts; y <- y/max(y) rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...) } ## Function built for the scaterplot matrix--Only need for initial run ## TO put (absolute) correlations on the upper panels, ## with size proportional to the correlations. panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x, y)) txt <- format(c(r, 0.123456789), digits = digits)[1] txt <- paste0(prefix, txt) if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex.cor * r) } ##End of the function #### CREATE A GO-BY CASE FROM A TEXT OR PAPER CASE WHILE UNDERSTANGING THE PRINCIPLS ############### #################################################################################################### ## Data to the case is provided by the reference text, maded available to the demo as a CSV file ## File has been downloaded to the user's directory and will now be pulled int the demom ## We read the csv file into the analysis by declaring the path to the file. ## Notice that all is the same, except the slash line must be converted to forward slash ## For an xlsx file see the read.xlsx() function. ##Read the csv file into R examData <- read.csv("C:/Users/user pc/OneDrive - DataAnalytic 1/TrainR_Analytics/Correlation/AnxietyParCorr.csv", header=TRUE, sep=",") ## Look at the data with summary(), describe(), str() and head() head(examData) ##Shows first five cases summary(examData) ##Descriptive statistics str(examData) ##Shows make up of the variables describe(examData) ##Table of descriptive statistics for each variable ## Pairs() creates scatter plot matrix--see graphics window ##Must have run the function at end of script pairs(Exam ~ Anxiety + Revise, data = examData, main = "Scatterplot Matrix: Exam", diag.panel = panel.hist, lower.panel = panel.smooth, upper.panel = panel.cor) #### We have assumed normal distribution in the data ################## ##Test data with the qqnorm() function ##The points must lie along the qqline par(mfrow = c(2, 2)) # 2 x 2 pictures on one plot anx<- qqnorm(examData$Anxiety, main = "Q-Q Anxiety "); qqline(examData$Anxiety) rev<- qqnorm(examData$Revise, main = "Q-Q Revise "); qqline(examData$Revise) exm<- qqnorm(examData$Exam, main = "Q-Q Exam "); qqline(examData$Exam) par(mfrow = c(1, 1)) #returns to 1 x 1 pictures on one plot ## Create a dataframe from the three variables of interest. examData2<- examData[,c("Exam","Anxiety","Revise")] head(examData2) ## Generate table of correlation of the dataframe with the function cor() ## cor(x,y, use="string", method="correlation type") ## use is choice for treating empty cells, method is pearson, spearman and kedall ## Args beyond basic are execluded due to nonmempty cells ## help(function) will take to explanation of the called function. EXAMPLE help(cor) # help(cor) cor(examData2, method="pearson") cor(examData2, method="spearman") ## We want to view the p-value to the correlations ## We must use another function because generates p-values--we could have chosen initially ## We will use rcorr() of the Hmisc() package. ## the function requires that data must be presented as data matrix see help(matrix) examMatrix<- as.matrix(examData[,c("Exam","Anxiety","Revise")]) head(examMatrix) # help(rcorr) rcorr(examMatrix, type = "pearson") rcorr(examMatrix, type = "spearman") ## View the conficent intervals to what has shown to be significant ## See help(cor.test)for details of the function ## Must view for each combination #help(cor.test) cor.test(examData$Anxiety, examData$Exam, method="pearson") cor.test(examData$Anxiety, examData$Exam, method="spearman") cor.test(examData$Anxiety, examData$Revise, method="pearson") cor.test(examData$Anxiety, examData$Revise, method="spearman") cor.test(examData$Revise, examData$Exam, method="pearson") cor.test(examData$Revise, examData$Exam, method="spearman") ## Measure the extent that the variance in on variable is shared by another--R squared ##Square the previous cor() cor(examData2, method="pearson")^2 cor(examData2, method="spearman")^2 ## Partial correlation ## Use the pcor() and pcor.test() OF the ggm package ## Must be used for pairs, controled variables can be one or multiple cor(examData2, method="pearson") #### Partial Exam to Anxiety pcExAn<- pcor(c("Exam", "Anxiety", "Revise"), var(examData2)) pcExAn pcExAn^2 ## The second arg of pcor.test is number of controled variables, ## the third is number of cases pcor.test(pcExAn, 1, 103) #### Partial Exam to Revise pcExRe<- pcor(c("Exam", "Revise", "Anxiety"), var(examData2)) pcExRe pcExRe^2 pcor.test(pcExRe, 1, 103) ##Conduct linear model ## Test for interaction between predictor variables examPredIntr <- lm(Exam ~ Anxiety * Revise, examData) summary(examPredIntr) ## Model with interaction removed examPred <- lm(Exam ~ Anxiety + Revise, examData) summary(examPred) confint(examPred) ## Run a robust linear model with rlm() function, and MASS and rlm packages ################### rlm.examlm <- rlm(Exam ~ Anxiety + Revise, data = examData) summary(rlm.examlm) ##call the model run by the previous line # help(rlm)