# Basic Linear Regression with Compositional Explanatory Variables # ================================================================ # Last updated: 21/03/2019 DM # # If used please reference/acknowledge the OpenCoDa website. library(compositions) library(zCompositions) library(robCompositions) library(ggplot2) library(ggtern) library(mvtnorm) # Generate junk data (replace with your own data) # Generate Composition set.seed(1009) z <- matrix(rnorm(300)+3,ncol=3) z[,2] <- z[,2] x <- exp(data.frame(z,1)) x <- x/rowSums(x) colnames(x) <- c("Sleep","SB","LIPA","MVPA") x[c(1,3), 4] <- 0 # Insert zeroes x[ 2,c(3,4)] <- NA # Insert n/a's x[12,1] <- NA # Insert n/a # Provide some covariates and outcome set.seed(101) age <- round(rnorm(100,50,10),1) sex <- sample(2,100,replace=T) bmi <- round(z%*% c(0.6,1.6,0.1) + 0.2*(age-50) -3*(sex-1) +20,1) sex <- as.factor(sex) # Combine data <- data.frame(x,age,sex,bmi) data # Separate composition CODAvars <- c("Sleep","SB","LIPA","MVPA") CODA.data <- data[,CODAvars] # We need to eliminate zeroes and N/a's from the raw data to carry out CODA analysis # We assume data is missing at random (MAR), i.e. can infer from observed values # We assume zeroes are observed due to detection limits, e.g. 1 minute epochs and use # imputation conditioned on this detection limit. # We use lrEM algorithm to minimise the distortion introduced by imputation # Specify detection limits dl_hr <- 1/24 dl_min <- dl_hr/60 dl <- c(dl_hr,dl_min,dl_min,dl_min) # Sleep to nearest hour, waking day to nearest minute # Impute zeroes and missing values CODA.data <- lrEMplus(CODA.data,dl,ini.cov="multRepl") # Function for geometric mean of a vector gm <- function(x) { exp(mean(log(x))) } # Inspect the compositional data via clr biplot clrx <-log(CODA.data/apply(CODA.data,1,gm)) colnames(clrx) <- paste0("z_",colnames(clrx)) clrx.pca <- prcomp(clrx) summary(clrx.pca) # 83.39% of observed variance included in biplot biplot(clrx.pca) # Create ilr coordinates for linear regression using robCompositions package # # We are using a subset of binary partitions based on pivot coordinate approach # # ("MVPA","LIPA","Sleep","SB") # | # --------------------------------- # | | # | ("LIPA","Sleep","SB") # | | # | --------------------- # | | | # | | ("Sleep","SB") # | | | # | | ----------------------- # | | | | # "MVPA" "LIPA" "Sleep" "SB" partitionOrder <- c("MVPA","LIPA","Sleep","SB") # Indicate order to split off components CODA.data1 <- CODA.data[,partitionOrder] ilr1 <- pivotCoord(CODA.data1,1) regdat <- data.frame(data[,-c(1:4)],ilr1) # Remove original compositional variables # Setup linear model model1.lm <-lm(bmi ~ ., data=regdat) # Have setup dataframe to only include used variables # Can specify in formula instead in usual way summary(model1.lm) # Examine model coefficients and p-values drop1(model1.lm,test="Chisq") # Examine impact of dropping coefficients (Equivalent to Likelihood Ratio Test) #Examine residuals par(mfrow=c(2,2)) # Display graphs 2x2 grid plot(model1.lm) par(mfrow=c(1,1)) # Reset display # You can also explore different rotations easily # If you want to look at the full set of ilr coordinates then you can alter partitionOrder partitionOrderB <- c("LIPA","MVPA","Sleep","SB") # Indicate order to split off components CODA.data1B <- CODA.data[,partitionOrderB] ilr1B <- pivotCoord(CODA.data1B,1) regdatB <- data.frame(data[,-c(1:4)],ilr1B) # Remove original compositional variables model1B.lm <-lm(bmi ~ ., data=regdatB) # Have setup dataframe to only include used variables summary(model1B.lm) # Examine model coefficients and p-values # If you're only interested in the pivot (first) coordinate for each behavior then # pivotCoord can do this very easily ilr1 <- pivotCoord(CODA.data1,1) # MVPA ilr2 <- pivotCoord(CODA.data1,2) # LIPA ilr3 <- pivotCoord(CODA.data1,3) # Sleep ilr4 <- pivotCoord(CODA.data1,4) # SB regdat1 <- data.frame(data[,-c(1:4)],ilr1) # Remove original compositional variables regdat2 <- data.frame(data[,-c(1:4)],ilr2) # Remove original compositional variables regdat3 <- data.frame(data[,-c(1:4)],ilr3) # Remove original compositional variables regdat4 <- data.frame(data[,-c(1:4)],ilr4) # Remove original compositional variables # Setup linear model model1.lm <-lm(bmi ~ ., data=regdat1) # Have setup dataframe to only include used variables model2.lm <-lm(bmi ~ ., data=regdat2) # Have setup dataframe to only include used variables model3.lm <-lm(bmi ~ ., data=regdat3) # Have setup dataframe to only include used variables model4.lm <-lm(bmi ~ ., data=regdat4) # Have setup dataframe to only include used variables summary(model1.lm) # Examine model coefficients and p-values summary(model2.lm) # Examine model coefficients and p-values summary(model3.lm) # Examine model coefficients and p-values summary(model4.lm) # Examine model coefficients and p-values # If you want to produce a ternary plot, we usually find it easiest to setup # an artificial (equally spaced) dataset, predict the outcome based on our model, # then apply the plotting routine to this artificial dataset to create a heatmap. # Setup illustration data mu <- apply(ilr1,2,mean) # Center Sigma <- cov(ilr1) # Covariance rand1 <- rmvnorm(1000,mu,Sigma) min1 <- apply(ilr1,2,min) # Quick estimate of max max1 <- apply(ilr1,2,max) # Quick estimate of min #Setup a rectangular grid within these limits len1 <- 40 xx1 <- rand1[rep(1,len1+1),] for (i in 1:len1+1) { xx1[i,] <- min1 + (max1-min1)*(i-1)/len1 } z1 <- xx1[,1] z2 <- xx1[,2] z3 <- xx1[,3] zz1 <- expand.grid(z1,z2,z3) # Eliminate data outside ellipse centred on CoDa mean based on observed covariance D2 <- apply(zz1,1,mahalanobis,center=mu,cov=Sigma) # This line can be slow if len1 is big!! q95 <- qchisq(0.95,df=3) xx2 <- zz1[D2