Motivation

Today we explore multivariate implementations of linear models (anova, regression, etc.). First however, let’s read in some data and examine a few of its properties:

library(knitr)
library(RRPP) 
library(mvabund)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
bumpus <- read.csv("Data/bumpus.csv",header=T)
bumpus.data <- log(as.matrix(bumpus[,(5:13)])) # matrix of linear measurements
sex <- as.factor(bumpus[,2])
surv <- as.factor(bumpus[,4])
SexBySurv <- paste(sex,surv)
TotalLength <- bumpus.data[,1]
Y <- bumpus.data[,-1]
mydat <- rrpp.data.frame(Y=Y,sex=sex,surv=surv,TotalLength=TotalLength,SexBySurv = SexBySurv)

Let’s do some simple exploratory descriptions of the data.

#Describe the data
(100*apply(bumpus.data,2,sd)) /apply(bumpus.data,2,mean)  # CV: Coefficient of Variation
##        TL        AE        WT       BHL        HL        FL       TTL        SW 
## 0.4406466 0.4112603 1.7600940 0.6465824 1.0884703 1.1748841 1.0818650 0.9140832 
##       SKL 
## 1.5486045
cor.bumpus<-cor(bumpus.data)
cor.bumpus
##            TL        AE        WT       BHL        HL        FL       TTL
## TL  1.0000000 0.6932777 0.5867330 0.4710276 0.4859102 0.4441295 0.3784330
## AE  0.6932777 1.0000000 0.5739660 0.5035656 0.6794433 0.5778571 0.5336809
## WT  0.5867330 0.5739660 1.0000000 0.5243470 0.5201937 0.4432527 0.4548922
## BHL 0.4710276 0.5035656 0.5243470 1.0000000 0.6234217 0.6163699 0.5853149
## HL  0.4859102 0.6794433 0.5201937 0.6234217 1.0000000 0.8215338 0.7486924
## FL  0.4441295 0.5778571 0.4432527 0.6163699 0.8215338 1.0000000 0.8107807
## TTL 0.3784330 0.5336809 0.4548922 0.5853149 0.7486924 0.8107807 1.0000000
## SW  0.4381097 0.4364807 0.4702211 0.5347691 0.5121106 0.5229477 0.4609773
## SKL 0.5052307 0.5864533 0.5192465 0.4944828 0.5522638 0.4571827 0.3887796
##            SW       SKL
## TL  0.4381097 0.5052307
## AE  0.4364807 0.5864533
## WT  0.4702211 0.5192465
## BHL 0.5347691 0.4944828
## HL  0.5121106 0.5522638
## FL  0.5229477 0.4571827
## TTL 0.4609773 0.3887796
## SW  1.0000000 0.3894656
## SKL 0.3894656 1.0000000
pairs(bumpus.data)

vcv.bumpus<-var(bumpus.data)
vcv.bumpus
##               TL           AE           WT          BHL           HL
## TL  0.0004995189 0.0003505933 0.0007473663 0.0002349790 0.0003454281
## AE  0.0003505933 0.0005119657 0.0007401567 0.0002543216 0.0004889894
## WT  0.0007473663 0.0007401567 0.0032481351 0.0006670257 0.0009429918
## BHL 0.0002349790 0.0002543216 0.0006670257 0.0004982111 0.0004426031
## HL  0.0003454281 0.0004889894 0.0009429918 0.0004426031 0.0010116995
## FL  0.0003377309 0.0004448629 0.0008595154 0.0004680943 0.0008890710
## TTL 0.0003074033 0.0004388798 0.0009422562 0.0004748318 0.0008655112
## SW  0.0002441446 0.0002462486 0.0006682014 0.0002976194 0.0004061418
## SKL 0.0005349521 0.0006286416 0.0014019736 0.0005228861 0.0008321882
##               FL          TTL           SW          SKL
## TL  0.0003377309 0.0003074033 0.0002441446 0.0005349521
## AE  0.0004448629 0.0004388798 0.0002462486 0.0006286416
## WT  0.0008595154 0.0009422562 0.0006682014 0.0014019736
## BHL 0.0004680943 0.0004748318 0.0002976194 0.0005228861
## HL  0.0008890710 0.0008655112 0.0004061418 0.0008321882
## FL  0.0011576319 0.0010026103 0.0004436410 0.0007369266
## TTL 0.0010026103 0.0013209518 0.0004177450 0.0006694162
## SW  0.0004436410 0.0004177450 0.0006216936 0.0004600516
## SKL 0.0007369266 0.0006694162 0.0004600516 0.0022443908
var(scale(bumpus.data))
##            TL        AE        WT       BHL        HL        FL       TTL
## TL  1.0000000 0.6932777 0.5867330 0.4710276 0.4859102 0.4441295 0.3784330
## AE  0.6932777 1.0000000 0.5739660 0.5035656 0.6794433 0.5778571 0.5336809
## WT  0.5867330 0.5739660 1.0000000 0.5243470 0.5201937 0.4432527 0.4548922
## BHL 0.4710276 0.5035656 0.5243470 1.0000000 0.6234217 0.6163699 0.5853149
## HL  0.4859102 0.6794433 0.5201937 0.6234217 1.0000000 0.8215338 0.7486924
## FL  0.4441295 0.5778571 0.4432527 0.6163699 0.8215338 1.0000000 0.8107807
## TTL 0.3784330 0.5336809 0.4548922 0.5853149 0.7486924 0.8107807 1.0000000
## SW  0.4381097 0.4364807 0.4702211 0.5347691 0.5121106 0.5229477 0.4609773
## SKL 0.5052307 0.5864533 0.5192465 0.4944828 0.5522638 0.4571827 0.3887796
##            SW       SKL
## TL  0.4381097 0.5052307
## AE  0.4364807 0.5864533
## WT  0.4702211 0.5192465
## BHL 0.5347691 0.4944828
## HL  0.5121106 0.5522638
## FL  0.5229477 0.4571827
## TTL 0.4609773 0.3887796
## SW  1.0000000 0.3894656
## SKL 0.3894656 1.0000000

1: Single-Factor MANOVA

Multivariate anova may be accompished in the base package of R using ‘lm’. However, one must specify that it is a manova; otherwise it is assumed that a series of independent anovas is desired. Alternatively, MANOVA via RRPP may be performed in the RRPP package.

model1 <- lm(bumpus.data~sex)
summary(model1) #yields a set of univariate analyses
## Response TL :
## 
## Call:
## lm(formula = TL ~ sex)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.047193 -0.015036 -0.002457  0.016119  0.043741 
## 
## Coefficients:
##             Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 5.062204   0.003022 1674.957  < 2e-16 ***
## sexm        0.015427   0.003779    4.083 7.62e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02116 on 134 degrees of freedom
## Multiple R-squared:  0.1106, Adjusted R-squared:  0.104 
## F-statistic: 16.67 on 1 and 134 DF,  p-value: 7.617e-05
## 
## 
## Response AE :
## 
## Call:
## lm(formula = AE ~ sex)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.047855 -0.013544 -0.001324  0.015324  0.043495 
## 
## Coefficients:
##             Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 5.485934   0.002756 1990.568  < 2e-16 ***
## sexm        0.024778   0.003446    7.191 4.09e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01929 on 134 degrees of freedom
## Multiple R-squared:  0.2784, Adjusted R-squared:  0.2731 
## F-statistic: 51.71 on 1 and 134 DF,  p-value: 4.089e-11
## 
## 
## Response WT :
## 
## Call:
## lm(formula = WT ~ sex)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.104927 -0.040364 -0.002589  0.036700  0.199309 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.218418   0.007893 407.771  < 2e-16 ***
## sexm        0.030662   0.009868   3.107  0.00231 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05525 on 134 degrees of freedom
## Multiple R-squared:  0.06721,    Adjusted R-squared:  0.06025 
## F-statistic: 9.655 on 1 and 134 DF,  p-value: 0.002306
## 
## 
## Response BHL :
## 
## Call:
## lm(formula = BHL ~ sex)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.059680 -0.013926  0.001607  0.015340  0.060176 
## 
## Coefficients:
##             Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 3.448380   0.003175 1086.008   <2e-16 ***
## sexm        0.005808   0.003970    1.463    0.146    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02223 on 134 degrees of freedom
## Multiple R-squared:  0.01572,    Adjusted R-squared:  0.008377 
## F-statistic:  2.14 on 1 and 134 DF,  p-value: 0.1458
## 
## 
## Response HL :
## 
## Call:
## lm(formula = HL ~ sex)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.108253 -0.018507  0.003394  0.020429  0.070796 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.915492   0.004503 647.458   <2e-16 ***
## sexm        0.010478   0.005630   1.861   0.0649 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03152 on 134 degrees of freedom
## Multiple R-squared:  0.0252, Adjusted R-squared:  0.01792 
## F-statistic: 3.464 on 1 and 134 DF,  p-value: 0.06492
## 
## 
## Response FL :
## 
## Call:
## lm(formula = FL ~ sex)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.088257 -0.015131 -0.000352  0.027031  0.075104 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.894376   0.004876 593.629   <2e-16 ***
## sexm        0.002451   0.006096   0.402    0.688    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03413 on 134 degrees of freedom
## Multiple R-squared:  0.001205,   Adjusted R-squared:  -0.006248 
## F-statistic: 0.1617 on 1 and 134 DF,  p-value: 0.6882
## 
## 
## Response TTL :
## 
## Call:
## lm(formula = TTL ~ sex)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.112349 -0.018086  0.000696  0.024309  0.083726 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.362005   0.005204  646.01   <2e-16 ***
## sexm        -0.003967   0.006507   -0.61    0.543    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03643 on 134 degrees of freedom
## Multiple R-squared:  0.002767,   Adjusted R-squared:  -0.004675 
## F-statistic: 0.3718 on 1 and 134 DF,  p-value: 0.5431
## 
## 
## Response SW :
## 
## Call:
## lm(formula = SW ~ sex)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.086408 -0.018190 -0.000752  0.015316  0.063325 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.725137   0.003564 764.588   <2e-16 ***
## sexm        0.004065   0.004456   0.912    0.363    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02495 on 134 degrees of freedom
## Multiple R-squared:  0.00617,    Adjusted R-squared:  -0.001246 
## F-statistic: 0.8319 on 1 and 134 DF,  p-value: 0.3634
## 
## 
## Response SKL :
## 
## Call:
## lm(formula = SKL ~ sex)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.109789 -0.026668 -0.001033  0.031965  0.106245 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.035292   0.006283 483.108  < 2e-16 ***
## sexm        0.037383   0.007855   4.759 4.97e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04398 on 134 degrees of freedom
## Multiple R-squared:  0.1446, Adjusted R-squared:  0.1382 
## F-statistic: 22.65 on 1 and 134 DF,  p-value: 4.971e-06
summary(manova(model1)) #does multivariate test (using Pillai's)
##            Df  Pillai approx F num Df den Df    Pr(>F)    
## sex         1 0.46652   12.243      9    126 9.166e-14 ***
## Residuals 134                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova(model1),test="Wilks")    #does multivariate test (using Wilks)
##            Df   Wilks approx F num Df den Df    Pr(>F)    
## sex         1 0.53348   12.243      9    126 9.166e-14 ***
## Residuals 134                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##### MANOVA via RRPP
model.rrpp <- lm.rrpp(Y~sex,data = mydat, print.progress = FALSE)
anova(model.rrpp)
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Ordinary Least Squares 
## Sums of Squares and Cross-products: Type I 
## Effect sizes (Z) based on F distributions
## 
##            Df      SS       MS     Rsq      F     Z Pr(>F)   
## sex         1 0.09822 0.098219 0.06854 9.8604 3.336  0.001 **
## Residuals 134 1.33476 0.009961 0.93146                       
## Total     135 1.43298                                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call: lm.rrpp(f1 = Y ~ sex, data = mydat, print.progress = FALSE)
plot(model.rrpp, type = "PC", pch=21, bg = sex)  #PC PLOT!
legend("topright", levels(sex), pch = 21, pt.bg = 1:4)

One may ask why permutation-based approaches are necessary. The following is a simple example for high-dimensional data, showing that when standard, parametric-based methods break down, RRPP can still evaluate statistical models.

##When parametric methods break down
Ynew <- matrix(rnorm(10000), nrow=100) #100 x 100 matrix: N=p
gp <- gl(2,50)
summary(manova(lm(Ynew~gp)))  #parametric algebra cannot be completed
## Error in summary.manova(manova(lm(Ynew ~ gp))): residuals have rank 98 < 100
anova(lm.rrpp(Ynew~gp,print.progress = FALSE))  #NO problem with RRPP!
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Ordinary Least Squares 
## Sums of Squares and Cross-products: Type I 
## Effect sizes (Z) based on F distributions
## 
##           Df     SS      MS     Rsq      F      Z Pr(>F)  
## gp         1  123.0 123.016 0.01254 1.2447 1.6447  0.043 *
## Residuals 98 9685.5  98.832 0.98746                       
## Total     99 9808.5                                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call: lm.rrpp(f1 = Ynew ~ gp, print.progress = FALSE)

2: Factorial MANOVA

Factorial manova may be accomplished in a similar manner. Alternatively, one may evaluate MANOVA via RRPP. Here one has the option of permuting the raw data or residuals from the reduced models. Simulations have shown that RRPP (reduced residual permutation procedure) displays higher statistical power for factorial designs and is thus preferred. It also permutes the correct exchangeable units under each null hypothesis (see discussion in lecture).

Finally, we have an example of performing pairwise comparisons. These are based on Euclidean distances between group means. Recall that univariate pairwise comparisons are based on the numerator of the T-test in some manner (typically |Y1-Y2|). This is a univariate representation of Euclidean distance, so the approach here is the multivariate equivalent.

#Factorial MANOVA
model2<-lm(bumpus.data~sex*surv)
summary(manova(model2))
##            Df  Pillai approx F num Df den Df    Pr(>F)    
## sex         1 0.47143  12.2882      9    124 9.520e-14 ***
## surv        1 0.34256   7.1788      9    124 2.442e-08 ***
## sex:surv    1 0.09718   1.4831      9    124    0.1613    
## Residuals 132                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Factorial MANOVA via RRPP
model2.rrpp <- lm.rrpp(Y~sex*surv,data = mydat, print.progress = FALSE)
anova(model2.rrpp)
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Ordinary Least Squares 
## Sums of Squares and Cross-products: Type I 
## Effect sizes (Z) based on F distributions
## 
##            Df      SS       MS     Rsq       F       Z Pr(>F)   
## sex         1 0.09822 0.098219 0.06854 10.0366  3.3622  0.001 **
## surv        1 0.03995 0.039948 0.02788  4.0821  2.0450  0.017 * 
## sex:surv    1 0.00305 0.003052 0.00213  0.3118 -0.9125  0.808   
## Residuals 132 1.29176 0.009786 0.90145                          
## Total     135 1.43298                                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call: lm.rrpp(f1 = Y ~ sex * surv, data = mydat, print.progress = FALSE)
groups <- interaction(mydat$sex, mydat$surv)
plot(model2.rrpp, type = "PC", pch=21, bg = groups)
legend("topright", levels(groups), pch = 21, pt.bg = 1:4)

#Also in vegan (FRPP only)
adonis(formula = Y~sex*surv, method = "euclidean" )
## 
## Call:
## adonis(formula = Y ~ sex * surv, method = "euclidean") 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)    
## sex         1   0.09822 0.098219 10.0366 0.06854  0.001 ***
## surv        1   0.03995 0.039948  4.0821 0.02788  0.018 *  
## sex:surv    1   0.00305 0.003052  0.3118 0.00213  0.846    
## Residuals 132   1.29176 0.009786         0.90145           
## Total     135   1.43298                  1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Pairwise comparisons 
PW <- pairwise(model2.rrpp,groups = SexBySurv, print.progress = FALSE)
summary(PW, test.type = "dist")  #No pairwise groups significant AFTER accounting for main effects!
## 
## Pairwise comparisons
## 
## Groups: f FALSE f TRUE m FALSE m TRUE 
## 
## RRPP: 1000 permutations
## 
## LS means:
## Vectors hidden (use show.vectors = TRUE to view)
## 
## Pairwise distances between means, plus statistics
##                          d  UCL (95%)          Z Pr > d
## f FALSE:f TRUE  0.03129167 0.06256436 -1.1053593  0.877
## f FALSE:m FALSE 0.05292134 0.08605751 -0.4842815  0.669
## f FALSE:m TRUE  0.05395092 0.08147123 -0.2851959  0.607
## f TRUE:m FALSE  0.07766871 0.10635991 -0.1143882  0.537
## f TRUE:m TRUE   0.06388601 0.08724825  0.3222765  0.388
## m FALSE:m TRUE  0.03848096 0.05488069 -0.2152641  0.600
model.null <- lm.rrpp(Y~1,data=mydat, print.progress = FALSE)
summary(pairwise(model2.rrpp, model.null, groups = SexBySurv, print.progress = FALSE),test.type = "dist")
## 
## Pairwise comparisons
## 
## Groups: f FALSE f TRUE m FALSE m TRUE 
## 
## RRPP: 1000 permutations
## 
## LS means:
## Vectors hidden (use show.vectors = TRUE to view)
## 
## Pairwise distances between means, plus statistics
##                          d  UCL (95%)         Z Pr > d
## f FALSE:f TRUE  0.03129167 0.04919726 0.5399611  0.282
## f FALSE:m FALSE 0.05292134 0.04254589 2.1786819  0.014
## f FALSE:m TRUE  0.05395092 0.04073769 2.3689910  0.007
## f TRUE:m FALSE  0.07766871 0.04720942 2.9792883  0.001
## f TRUE:m TRUE   0.06388601 0.04332194 2.6774899  0.004
## m FALSE:m TRUE  0.03848096 0.03849811 1.6970478  0.051
   #Male vs. female seem to be biggest differences

3: Multivariate Regression

Multivariate regression is accomplished in a similar manner, with the exception that the independent variable (X) is continuous.

It is a challenge to visualize the multivariate regression, as the Y-axis of this plot is multivariate and can be highly dimensional. Several visualization representations can be used to summarize trends in a multivariate-Y versus X. One approach from the RRPP package are illustrated here.

summary(manova(Y~TotalLength))
##              Df  Pillai approx F num Df den Df    Pr(>F)    
## TotalLength   1 0.55629   19.903      8    127 < 2.2e-16 ***
## Residuals   134                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model.reg <- lm.rrpp(Y~TotalLength, data = mydat, print.progress = FALSE)
anova(model.reg)
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Ordinary Least Squares 
## Sums of Squares and Cross-products: Type I 
## Effect sizes (Z) based on F distributions
## 
##              Df      SS      MS     Rsq      F      Z Pr(>F)   
## TotalLength   1 0.38116 0.38116 0.26599 48.559 5.6649  0.001 **
## Residuals   134 1.05182 0.00785 0.73401                        
## Total       135 1.43298                                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call: lm.rrpp(f1 = Y ~ TotalLength, data = mydat, print.progress = FALSE)
### Visualizing multivariate regression 
plot(model.reg, type = "regression", reg.type = "RegScore", 
     predictor = mydat$TotalLength, pch=19)

4: MANCOVA

Multivariate analysis of covariance contains both continuous and categorical X-variables. Here one must first test for homogeneity in Y~X slopes across groups. If these are not statistically distinguishable, a common slope model may be used to compare group means while accounting for a common slope.

On the other hand, if the slopes are different, one must perform a pairwise comparison of slopes test to determine how the slopes differ (rather than the group mean test).

As with multivariate regression, visualizing these trends is challenging. RRPP has several options (derived from the literature): two of which are shown here.

#MANCOVA    
summary(manova(lm(Y~TotalLength*sex*surv)))
##                       Df  Pillai approx F num Df den Df    Pr(>F)    
## TotalLength            1 0.63862  26.7287      8    121 < 2.2e-16 ***
## sex                    1 0.41791  10.8590      8    121 1.924e-11 ***
## surv                   1 0.26227   5.3771      8    121 8.593e-06 ***
## TotalLength:sex        1 0.09667   1.6186      8    121    0.1263    
## TotalLength:surv       1 0.02795   0.4348      8    121    0.8981    
## sex:surv               1 0.09295   1.5499      8    121    0.1471    
## TotalLength:sex:surv   1 0.01526   0.2343      8    121    0.9838    
## Residuals            128                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova(lm(Y~TotalLength+sex*surv))) # FIT COMMON SLOPE
##              Df  Pillai approx F num Df den Df    Pr(>F)    
## TotalLength   1 0.62878  26.2545      8    124 < 2.2e-16 ***
## sex           1 0.40635  10.6098      8    124 2.859e-11 ***
## surv          1 0.26117   5.4791      8    124 6.315e-06 ***
## sex:surv      1 0.08125   1.3708      8    124    0.2157    
## Residuals   131                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#MANCOVA via RRPP
model.mancova <- lm.rrpp(Y~TotalLength*sex*surv, data =mydat, print.progress = FALSE)
anova(model.mancova)
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Ordinary Least Squares 
## Sums of Squares and Cross-products: Type I 
## Effect sizes (Z) based on F distributions
## 
##                       Df      SS      MS     Rsq       F       Z Pr(>F)   
## TotalLength            1 0.38116 0.38116 0.26599 53.0290  5.8033  0.001 **
## sex                    1 0.03336 0.03336 0.02328  4.6406  2.4586  0.010 * 
## surv                   1 0.06762 0.06762 0.04719  9.4075  3.5580  0.001 **
## TotalLength:sex        1 0.00554 0.00554 0.00386  0.7702  0.0070  0.505   
## TotalLength:surv       1 0.00449 0.00449 0.00313  0.6246 -0.2631  0.596   
## sex:surv               1 0.01942 0.01942 0.01355  2.7015  1.7321  0.042 * 
## TotalLength:sex:surv   1 0.00137 0.00137 0.00096  0.1905 -1.8356  0.970   
## Residuals            128 0.92003 0.00719 0.64204                          
## Total                135 1.43298                                          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call: lm.rrpp(f1 = Y ~ TotalLength * sex * surv, data = mydat, print.progress = FALSE)
#Pairwise comparisons of slopes: NOTE: Used here for illustrative purposes only! 
   # Interaction should show evidence justifying this analysis
PW.mancova <- pairwise(model.mancova, groups = SexBySurv, covariate = TotalLength, print.progress = FALSE)
summary(PW.mancova, test.type = "VC", angle.type = "deg")
## 
## Pairwise comparisons
## 
## Groups: f FALSE f TRUE m FALSE m TRUE 
## 
## RRPP: 1000 permutations
## 
## Slopes (vectors of variate change per one unit of covariate 
##         change, by group):
## Vectors hidden (use show.vectors = TRUE to view)
## 
## Pairwise statistics based on slopes vector correlations (r) 
##           and angles, acos(r)
## The null hypothesis is that r = 1 (parallel vectors).
## This null hypothesis is better treated as the angle 
##           between vectors = 0
##                         r     angle UCL (95%)          Z Pr > angle
## f FALSE:f TRUE  0.9900078  8.106449  39.18516 -2.1209205      0.983
## f FALSE:m FALSE 0.9925400  7.002909  25.00201 -2.2438546      0.981
## f FALSE:m TRUE  0.9475137 18.645708  35.35471 -0.5622322      0.713
## f TRUE:m FALSE  0.9784336 11.920935  40.77940 -1.5571648      0.946
## f TRUE:m TRUE   0.9465036 18.825875  41.25513 -0.4911424      0.698
## m FALSE:m TRUE  0.9587392 16.516249  31.89484 -0.2167071      0.592
### Visualizing MANCOVA
plot(model.mancova, type = "regression", reg.type = "RegScore", 
     predictor = mydat$TotalLength, pch=19, col = as.numeric(groups))

plot(model.mancova, type = "regression", reg.type = "PredLine", 
     predictor = mydat$TotalLength, pch=19,
     col = as.numeric(groups))

4a: RRPP: A more complex example

Here we perform a series of analyses on a high-dimensional dataset, containing 54 specimens and 112 variables describing the body shapes of fish. Several RRPP analyses are shown, which remind us of the strength of these permutation-based methods.

First, a MANOVA via RRPP, including pairwise comparisons and graphical visualization of results.

##           Df       SS        MS     Rsq      F      Z Pr(>F)   
## Pop        1 0.008993 0.0089927 0.15964 16.076 3.9997  0.001 **
## Sex        1 0.015917 0.0159169 0.28255 28.453 4.1103  0.001 **
## Pop:Sex    1 0.003453 0.0034532 0.06130  6.173 3.7015  0.001 **
## Residuals 50 0.027970 0.0005594 0.49651                        
## Total     53 0.056333                                          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Pairwise comparisons
## 
## Groups: F.Marsh M.Marsh F.Sinkhole M.Sinkhole 
## 
## RRPP: 1000 permutations
## 
## LS means:
## Vectors hidden (use show.vectors = TRUE to view)
## 
## Pairwise distances between means, plus statistics
##                                d  UCL (95%)          Z Pr > d
## F.Marsh:M.Marsh       0.04611590 0.04297378  2.3159417  0.007
## F.Marsh:F.Sinkhole    0.03302552 0.03312516  1.6124629  0.055
## F.Marsh:M.Sinkhole    0.03881514 0.04633821 -0.5421145  0.699
## M.Marsh:F.Sinkhole    0.04605211 0.05506197 -0.2523753  0.597
## M.Marsh:M.Sinkhole    0.02802087 0.03343901  0.1854026  0.420
## F.Sinkhole:M.Sinkhole 0.02568508 0.04364031 -2.2111968  0.993

Next we perform MANCOVA to include body size as a covariate. Here, pairwise comparisons are of multivariate slopes. Visualization of these high-dimensional allometry trends are found via approaches by Drake & Klingenberg (2008) and Adams & Nistri (2010).

##                 Df       SS        MS     Rsq       F      Z Pr(>F)   
## logSize          1 0.014019 0.0140193 0.24886 29.2078 5.0041  0.001 **
## Pop              1 0.010247 0.0102472 0.18190 21.3491 4.7949  0.001 **
## Sex              1 0.005028 0.0050284 0.08926 10.4763 4.5388  0.001 **
## logSize:Pop      1 0.001653 0.0016528 0.02934  3.4434 2.6259  0.004 **
## logSize:Sex      1 0.001450 0.0014498 0.02574  3.0206 2.2721  0.014 * 
## Pop:Sex          1 0.001370 0.0013696 0.02431  2.8534 2.2866  0.010 * 
## logSize:Pop:Sex  1 0.000486 0.0004865 0.00864  1.0135 0.3337  0.382   
## Residuals       46 0.022079 0.0004800 0.39194                         
## Total           53 0.056333                                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Pairwise comparisons
## 
## Groups: F.Marsh M.Marsh F.Sinkhole M.Sinkhole 
## 
## RRPP: 1000 permutations
## 
## Slopes (vectors of variate change per one unit of covariate 
##         change, by group):
## Vectors hidden (use show.vectors = TRUE to view)
## 
## Pairwise statistics based on slopes vector correlations (r) 
##           and angles, acos(r)
## The null hypothesis is that r = 1 (parallel vectors).
## This null hypothesis is better treated as the angle 
##           between vectors = 0
##                               r    angle UCL (95%)          Z Pr > angle
## F.Marsh:M.Marsh       0.6139591 52.12367  95.40553 -1.0051685      0.844
## F.Marsh:F.Sinkhole    0.6318267 50.81498  88.71850 -0.8268874      0.782
## F.Marsh:M.Sinkhole    0.4461018 63.50614 116.45215 -1.3459215      0.904
## M.Marsh:F.Sinkhole    0.7175129 44.15048  77.31288 -1.4269023      0.926
## M.Marsh:M.Sinkhole    0.5629975 55.73665  74.08894  0.5136053      0.316
## F.Sinkhole:M.Sinkhole 0.4627734 62.43378  81.64708  0.4261192      0.342

5: Multivariate LM with non-continuous data

If one does not have multivariate normal data (ie, not multivariate continuous), MANOVA via RRPP may still be utilized to evaluate such hypotheses. Here a distance matrix is used as the input Y-data. NOTE: one must use an appropriate distance measure for the data type under investigation (see lecture).

mydat$Ydist <- dist(Y)
mancova.dist <- lm.rrpp (Ydist~TotalLength*sex*surv, data =mydat, print.progress = FALSE)
anova(mancova.dist)
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Ordinary Least Squares 
## Sums of Squares and Cross-products: Type I 
## Effect sizes (Z) based on F distributions
## 
##                       Df      SS      MS     Rsq       F       Z Pr(>F)   
## TotalLength            1 0.38116 0.38116 0.26599 53.0290  5.8033  0.001 **
## sex                    1 0.03336 0.03336 0.02328  4.6406  2.4586  0.010 * 
## surv                   1 0.06762 0.06762 0.04719  9.4075  3.5580  0.001 **
## TotalLength:sex        1 0.00554 0.00554 0.00386  0.7702  0.0070  0.505   
## TotalLength:surv       1 0.00449 0.00449 0.00313  0.6246 -0.2631  0.596   
## sex:surv               1 0.01942 0.01942 0.01355  2.7015  1.7321  0.042 * 
## TotalLength:sex:surv   1 0.00137 0.00137 0.00096  0.1905 -1.8356  0.970   
## Residuals            128 0.92003 0.00719 0.64204                          
## Total                135 1.43298                                          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call: lm.rrpp(f1 = Ydist ~ TotalLength * sex * surv, data = mydat,  
##     print.progress = FALSE)

5a: Species Abundance Data

A common scenario in community ecology is to have a data matrix comprised of species abundances across sites. Typically, each species is a column (variable), and the rows are sites. One goal is to determine whether community richness patterns differ across sites in a particular manner (e.g., islands vs. continents, or across some ecological gradient). Effectively, these are ANOVA and regression-type hypotheses, but the response data are in the form of counts, not continuous variables. There is a large literature on what to do in such cases, and several implementations for analysis are possible.

One school uses matrices of log(abundance+1) or Bray-Curtis distances analyzed using linear models (LM) via permutational MANOVA (e.g., Anderson).

Another approach is to use generalized linear models (GLM with poisson or other link-function) on each species separately, followed by evaluating \(\small\sum{F}\)-ratio found across species (e.g., Wharton).

There are strengths and weaknesses to both approaches, and both have advantages. GLM can be accommodating to specific data types but LM is more robust to departures from assumptions. Importantly, it is suggested that residual resampling (i.e., RRPP!) cures a lot of LM/GLM ills. Future theoretical work on RRPP will investigate this more thoroughly.

for discussion see: Wharton et al. 2016. Methods Ecol. Evol.

Here we show a simple example using each of these 3 options:

data("spider")  #species abundance data across localities
dim(spider$abund)
## [1] 28 12
# transformation of raw data (log+1)
spid <- data.frame(spider$x)
spid$abund <- as.matrix(spider$abund)
spid$logY <- as.matrix(log(spider$abund + 1)) # log-transform

### 1: MANOVA via RRPP on log(Y) 
fitLM <- lm.rrpp(logY ~ soil.dry + bare.sand + fallen.leaves + moss
                 + herb.layer + reflection, 
                 data = spid, iter = 999, SS.type = "II",
                 print.progress = FALSE)
anova(fitLM)
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Ordinary Least Squares 
## Sums of Squares and Cross-products: Type II 
## Effect sizes (Z) based on F distributions
## 
##               Df     SS     MS      Rsq      F       Z Pr(>F)   
## soil.dry       1  46.08 46.076 0.083788 6.8128 3.03685  0.001 **
## bare.sand      1  10.23 10.227 0.018597 1.5121 0.95185  0.166   
## fallen.leaves  1  29.44 29.435 0.053528 4.3523 2.29271  0.009 **
## moss           1  19.50 19.498 0.035457 2.8830 1.75908  0.042 * 
## herb.layer     1  13.05 13.053 0.023737 1.9301 1.27281  0.102   
## reflection     1  11.36 11.359 0.020656 1.6795 1.10126  0.135   
## Residuals     21 142.03  6.763 0.258270                         
## Total         27 549.91                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call: lm.rrpp(f1 = logY ~ soil.dry + bare.sand + fallen.leaves + moss +  
##     herb.layer + reflection, iter = 999, SS.type = "II", data = spid,  
##     print.progress = FALSE)
pca <- plot(fitLM, type = "PC")
text(pca$PC.points, rownames(pca$PC.points), pos = 1, cex = 0.5)

### 2: MANOVA via RRPP on Bray-Curtis distance (ala Anderson)
Db <- vegdist(spid$abund, method="bray")

fitD <- lm.rrpp(Db ~ soil.dry + bare.sand + fallen.leaves + moss
                + herb.layer + reflection, 
                data = spid, iter = 999, SS.type = "II",
                print.progress = FALSE)
anova(fitD)  #pretty similar
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Ordinary Least Squares 
## Sums of Squares and Cross-products: Type II 
## Effect sizes (Z) based on F distributions
## 
##               Df      SS      MS     Rsq      F      Z Pr(>F)   
## soil.dry       1  0.8891 0.88908 0.07082 4.2743 3.5080  0.001 **
## bare.sand      1  0.4640 0.46404 0.03697 2.2309 2.1061  0.016 * 
## fallen.leaves  1  0.5586 0.55865 0.04450 2.6857 2.3120  0.008 **
## moss           1  0.5680 0.56800 0.04525 2.7307 2.2711  0.016 * 
## herb.layer     1  0.3315 0.33150 0.02641 1.5937 1.3121  0.100 . 
## reflection     1  0.2399 0.23994 0.01911 1.1535 0.5277  0.280   
## Residuals     21  4.3681 0.20801 0.34797                        
## Total         27 12.5532                                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call: lm.rrpp(f1 = Db ~ soil.dry + bare.sand + fallen.leaves + moss +  
##     herb.layer + reflection, iter = 999, SS.type = "II", data = spid,  
##     print.progress = FALSE)
### 3: Generalized linear models: mvAbund (ala Wharton)
spiddat <- mvabund(spider$abund)
X <- spider$x

#To fit a log-linear model assuming counts are poisson:
  #NOTE: mvabund fits generlized LM to each Y-variable separately, then sums F-values
glm.spid <- manyglm(spiddat~X, family="poisson")
summary(glm.spid)   #not the same as above
## 
## Test statistics:
##                wald value Pr(>wald)    
## (Intercept)        17.941  0.000999 ***
## Xsoil.dry          20.142  0.000999 ***
## Xbare.sand          8.057  0.087912 .  
## Xfallen.leaves     11.414  0.003996 ** 
## Xmoss              13.831  0.002997 ** 
## Xherb.layer        17.671  0.000999 ***
## Xreflection        12.519  0.004995 ** 
## --- 
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Test statistic:  51.38, p-value: 0.000999 
## Arguments:
##  Test statistics calculated assuming response assumed to be uncorrelated 
##  P-value calculated using 1000 resampling iterations via pit.trap resampling (to account for correlation in testing).