Motivation

Today we explore ways in which quantitative analyses may be performed in light of phylogenetic relationshps. When species trait values are used as data, we encounter a non-independence issue: species are not independent because they exhibit a shared evolutionary history as described by their phylogeny. Thus, the expected covariation between taxa is not iid: \(\small\sim\mathcal{N}(0,1)\). Instead, it is \(\small\sim\mathcal{N}(0,\textbf{V})\) where \(\small\mathbf{V}\) is an expected covariance matrix described by the phylogeny and the evolutionary model under consideration.

Numerous analytical methods have been devoped to take phylogeny into account, which collectively are termed phylogenetic comparative methods (PCMs). Today we will examine several of the more commonly used PCMs ones.

Data Setup

Let’s first read in some data, match the data to the phylogeny, and prune both. NOTE: a critical issue in R is that the species names in the data matrix [names(Y) or rownames(Y) depending on input type] match those on the phylogeny [phy$tip.label]:

library(nlme)   # Contains GLS 
library(ape)    # Many phylogenetics-related analyses (see Paradis, Analysis of Phylogenetics and Evolution with R)
library(geiger) # Many evolutionary analyses
library(phytools) 
## Loading required package: maps
library(geomorph) #contains some multivariate PCMs
## Loading required package: RRPP
## Loading required package: rgl
## Loading required package: Matrix
library(RRPP)

## Read data, phylogeny, and match the two
tree.best<-read.nexus("Data/PlethodontidTree.nex") #Maximum Credible Tree
plethdata<-read.csv("Data/meandata-CinGlutOnly.csv",row.names=1, header=TRUE )
 size<-plethdata[,2];names(size)<-row.names(plethdata) 
 gp<-as.factor(plethdata[,1]); names(gp)<-row.names(plethdata)
# Prune tree to match taxa
 plethtree<-treedata(phy = tree.best,data = plethdata, warnings = FALSE)$phy
 plot(tree.best)

plot(plethtree)
axisPhylo(1) 

 #Relative body proportions: trait/size
 Y<-plethdata[,c(3:8)]
     Y<-apply(Y,2,as.numeric);row.names(Y)<-row.names(plethdata)
     TL = Y[,1]
     Y.sz<-Y/size
gdf <- rrpp.data.frame(Y=Y, Y.sz=Y.sz, TL = TL, size=size,gp=gp) 
gdf$Cov <- vcv.phylo(plethtree)

1: Phylogenetic regression/anova

One may perform models of the form Y~X while accounting for the phylogeny. This is accomplished using generalized least squares (GLS), which is can be implemented as a weighted regression. There are several ways to accomplish this in R, but first let’s examine a non-phylogenetic analysis

anova(lm(TL~size))  
## Analysis of Variance Table
## 
## Response: TL
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## size       1 3707.9  3707.9  116.18 1.149e-12 ***
## Residuals 35 1117.0    31.9                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(TL~size))
## 
## Call:
## lm(formula = TL ~ size)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5004  -3.0022  -0.5384   3.7960  12.4688 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.29923    5.55078   0.234    0.816    
## size         0.96815    0.08982  10.779 1.15e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.649 on 35 degrees of freedom
## Multiple R-squared:  0.7685, Adjusted R-squared:  0.7619 
## F-statistic: 116.2 on 1 and 35 DF,  p-value: 1.149e-12
plot(TL~size)
abline(lm(TL~size))

summary(gls(TL~size, data=data.frame(TL=TL,size=size)))
## Generalized least squares fit by REML
##   Model: TL ~ size 
##   Data: data.frame(TL = TL, size = size) 
##        AIC      BIC    logLik
##   238.4264 243.0924 -116.2132
## 
## Coefficients:
##                 Value Std.Error   t-value p-value
## (Intercept) 1.2992281  5.550775  0.234062  0.8163
## size        0.9681511  0.089820 10.778810  0.0000
## 
##  Correlation: 
##      (Intr)
## size -0.986
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.38975579 -0.53143002 -0.09531055  0.67195204  2.20715419 
## 
## Residual standard error: 5.649265 
## Degrees of freedom: 37 total; 35 residual

Now let’s take phylogeny into consideration:

  #using GLS
df <- data.frame(TL=TL,size=size, species = names(TL), gp=gp)
bm.gls<-gls(TL~size, correlation=corBrownian(phy=plethtree, form = ~species), data= df)
summary(bm.gls)  #Here the correlation structure of the phylogeny is used
## Generalized least squares fit by REML
##   Model: TL ~ size 
##   Data: df 
##        AIC      BIC    logLik
##   282.1642 286.8303 -138.0821
## 
## Correlation Structure: corBrownian
##  Formula: ~species 
##  Parameter estimate(s):
## numeric(0)
## 
## Coefficients:
##                Value Std.Error  t-value p-value
## (Intercept) 8.421406 14.684714 0.573481  0.5700
## size        0.830722  0.194514 4.270769  0.0001
## 
##  Correlation: 
##      (Intr)
## size -0.756
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -0.66113579 -0.09349179  0.04133058  0.26834434  0.71613862 
## 
## Residual standard error: 19.1745 
## Degrees of freedom: 37 total; 35 residual
anova(bm.gls)
## Denom. DF: 35 
##             numDF  F-value p-value
## (Intercept)     1 33.79994  <.0001
## size            1 18.23947   1e-04
#using RRPP: same
pgls.res<-lm.rrpp(TL~size,data=gdf, Cov = gdf$Cov,  print.progress = FALSE) 
anova(pgls.res)
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Generalized Least-Squares (via OLS projection) 
## Sums of Squares and Cross-products: Type I 
## Effect sizes (Z) based on F distributions
## 
##           Df     SS      MS     Rsq      F      Z Pr(>F)   
## size       1 189.34 189.343 0.31689 18.239 3.0164  0.001 **
## Residuals 35 363.33  10.381 0.60809                        
## Total     36 597.50                                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call: lm.rrpp(f1 = TL ~ size, data = gdf, Cov = gdf$Cov, print.progress = FALSE)
pgls.res$LM$gls.coefficients
## (Intercept)        size 
##   8.4214060   0.8307223
#using Independent Contrasts: same
picTL<-pic(TL, plethtree)
picSz<-pic(size, plethtree)
cor.test(picTL, picSz)
## 
##  Pearson's product-moment correlation
## 
## data:  picTL and picSz
## t = 4.3666, df = 34, p-value = 0.0001119
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3373074 0.7752775
## sample estimates:
##       cor 
## 0.5994172
summary(lm(picTL~picSz - 1)) #Contrasts run through origin: see Garland et al. 1992
## 
## Call:
## lm(formula = picTL ~ picSz - 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6308 -1.1430 -0.0204  1.2525 13.0953 
## 
## Coefficients:
##       Estimate Std. Error t value Pr(>|t|)    
## picSz   0.8307     0.1945   4.271 0.000142 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.222 on 35 degrees of freedom
## Multiple R-squared:  0.3426, Adjusted R-squared:  0.3238 
## F-statistic: 18.24 on 1 and 35 DF,  p-value: 0.0001417
plot(picTL~picSz)
abline(lm(picTL~picSz - 1))

The approach is also appropriate for anova-models, and for multivariate data (using D-PGLS):

  #Phylogenetic anova
anova(lm.rrpp(TL ~ gp, Cov = gdf$Cov, data = gdf, iter = 999, print.progress = FALSE) )
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Generalized Least-Squares (via OLS projection) 
## Sums of Squares and Cross-products: Type I 
## Effect sizes (Z) based on F distributions
## 
##           Df     SS     MS     Rsq      F       Z Pr(>F)
## gp         1  17.67 17.669 0.02957 1.1559 0.58019  0.295
## Residuals 35 535.01 15.286 0.89540                      
## Total     36 597.50                                     
## 
## Call: lm.rrpp(f1 = TL ~ gp, iter = 999, data = gdf, Cov = gdf$Cov,  
##     print.progress = FALSE)
anova(gls(TL~gp, correlation=corBrownian(phy=plethtree, form = ~species), data= df))  #same
## Denom. DF: 35 
##             numDF   F-value p-value
## (Intercept)     1 22.954161  <.0001
## gp              1  1.155905  0.2897
  #multivariate phy-anova/regression (even when p>N)
PGLS.reg<-lm.rrpp(Y ~ size, Cov = gdf$Cov, data = gdf, iter = 999, print.progress = FALSE) 
anova(PGLS.reg)
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Generalized Least-Squares (via OLS projection) 
## Sums of Squares and Cross-products: Type I 
## Effect sizes (Z) based on F distributions
## 
##           Df     SS      MS     Rsq      F      Z Pr(>F)   
## size       1 214.21 214.214 0.35852 19.561 3.0528  0.001 **
## Residuals 35 383.29  10.951 0.64148                        
## Total     36 597.50                                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call: lm.rrpp(f1 = Y ~ size, iter = 999, data = gdf, Cov = gdf$Cov,  
##     print.progress = FALSE)
plot(PGLS.reg, type = "regression", predictor=size,reg.type = "RegScore")

PGLS.aov<-lm.rrpp(Y ~ gp, Cov = gdf$Cov, data = gdf, iter = 999, print.progress = FALSE) 
anova(PGLS.aov)  
## 
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals 
## Number of permutations: 1000 
## Estimation method: Generalized Least-Squares (via OLS projection) 
## Sums of Squares and Cross-products: Type I 
## Effect sizes (Z) based on F distributions
## 
##           Df     SS     MS     Rsq      F       Z Pr(>F)
## gp         1  23.44 23.435 0.03922 1.4288 0.77209  0.236
## Residuals 35 574.07 16.402 0.96078                      
## Total     36 597.50                                     
## 
## Call: lm.rrpp(f1 = Y ~ gp, iter = 999, data = gdf, Cov = gdf$Cov, print.progress = FALSE)

2: Phylogenetic PLS (partial least squares)

One may also account for phylogeny when performing PLS, the multivariate association between two data matrices.

PLS.Y <- phylo.integration(A = Y[,1:3], A2 = Y[,4:6], phy= plethtree, print.progress = FALSE)
summary(PLS.Y)
## 
## Call:
## phylo.integration(A = Y[, 1:3], A2 = Y[, 4:6], phy = plethtree,  
##     print.progress = FALSE) 
## 
## 
## 
## r-PLS: 0.639
## 
## Effect Size (Z): 3.3592
## 
## P-value: 0.001
## 
## Based on 1000 random permutations
plot(PLS.Y)

3: Phylogenetic Ordination

A PCA with the phylogeny superimposed is a phylomorphospace. There are numerous functions for accomplishing this (e.g., in phytools): one we will illustrate is in geomorph. This function, gmprcomp, allows one to implement a phylomorphospace approach, a phylogenetic-PCA, or PACA (phylogenetically-aligned component analysis):

rownames(Y.sz) <- 1:nrow(Y.sz)
plethtree2 <- plethtree; plethtree2$tip.label <- rownames(Y.sz)
PCA.w.phylo <- gm.prcomp(Y.sz, phy = plethtree2) #: phylomorphospace
phylo.PCA <- gm.prcomp(Y.sz, phy = plethtree2, GLS = TRUE) #phylo-PCA
PaCA.gls <- gm.prcomp(Y.sz, phy = plethtree2, 
                      align.to.phy = TRUE, GLS = TRUE) # PACA

par(mfrow=c(2,2))
plot(PCA.w.phylo, phylo = TRUE, main = "PCA.w.phylo", 
     phylo.par = list(node.labels = FALSE))
plot(phylo.PCA, phylo = TRUE, main = "phylo PCA",
     phylo.par = list(node.labels = FALSE))
plot(PaCA.gls, phylo = TRUE, main = "PaCA",
     phylo.par = list(node.labels = FALSE))
par(mfrow=c(1,1))

4: Phylogenetic Signal

Phylogenetic signal is the degree to which closely related species display similar trait values. In R this may be implemented in picante, phytools, and geomorph. Multivariate phylogenetic signal is found in geomorph. Here we illustrate this in both packages.

phylosig(plethtree, TL, method="K", test=T, nsim=1000)  #phytools
## 
## Phylogenetic signal K : 0.368483 
## P-value (based on 1000 randomizations) : 0.041
physignal(gdf$TL,plethtree, print.progress = FALSE)   #geomorph
## 
## Call:
## physignal(A = gdf$TL, phy = plethtree, print.progress = FALSE) 
## 
## 
## 
## Observed Phylogenetic Signal (K): 0.3685
## 
## P-value: 0.033
## 
## Effect Size: 1.8148
## 
## Based on 1000 random permutations
#Multivariate
PS.shape <- physignal(A=Y,phy=plethtree,iter=999, print.progress = FALSE)
summary(PS.shape)
## 
## Call:
## physignal(A = Y, phy = plethtree, iter = 999, print.progress = FALSE) 
## 
## 
## 
## Observed Phylogenetic Signal (K): 0.4342
## 
## P-value: 0.002
## 
## Effect Size: 2.6861
## 
## Based on 1000 random permutations
plot(PS.shape)

5: Comparing Phylogenetic Evolutionary Rates

Comparing the rate of phenotypic evolution is sometimes of interest. Typically, there are two approaches: comparing rates among clades (e.g., does body size evolve faster in island taxa versus continental species?), and comparing rates among traits (e.g., does size evolve faster than shape?). The general approach is to fit the data to the phylogeny under a model with a single evolutionary rate for all taxa, and compare this fit to a model with multiple rates for groups of taxa (BM1 vs. BMM). This may be implemented in several packages:

#5a: Compare Evolutionary Rates Among Clades
ER<-compare.evol.rates(A=Y, phy=plethtree,gp=gp,iter=999, print.progress = FALSE)
summary(ER)   #significantly higher rate of morphological evolution 'large' Plethodon
## 
## Call:
## 
## 
## Observed Rate Ratio: 3.6913
## 
## P-value: 0.027
## 
## Effect Size: 1.7194
## 
## Based on 1000 random permutations
## 
## The rate for group Large is 3.27167050803742  
## 
## The rate for group Small is 0.886328591621751
plot(ER)

#5b: Compare Evolutionary Rates Among Traits (multivariate)
var.gp <- c("B","A","A","B","B","B")  #head (A) vs. body (B)
EMR<-compare.multi.evol.rates(A=Y,gp=var.gp, Subset=TRUE, phy= plethtree,iter=999, print.progress = FALSE)
summary(EMR) #Limb traits evolve faster than head traits
## 
## Call:
## 
## 
## Observed Rate Ratio: 30.8266
## 
## P-value: 0.001
## 
## Effect Size: 9.8832
## 
## Based on 1000 random permutations
## 
## The rate for group A is 0.128874014550266  
## 
## The rate for group B is 3.9727412987915
plot(EMR)

6: Comparing Evolutionary Models (Evolutionary ‘Modes’)

The previous section allowed a comparison to BM1 and BMM evolutionary models. However, there is a much broader class of models that could be considered, including Ornstein-Uhlenbeck models, early-burst models, etc. OU models can have one or more optima (OU1, OUM), and more complicated models can be envisioned.

As with Brownian motion rate models, the procedure is straightforward: the fit of the data to the phylogeny under a specified evolutionary model is obtained, along with the corresponding logL and AIC. Next, these are used to compare the fit of models to determine which has the highest support. A simple example is below (NOTE: The package OUCH, MVSLOUCH, OUwie, and others can compare more complicated models: BM1, BMM, OU1, OUM, etc.).

NOTE: only univariate examples are shown here, as multivariate OU methods do not currently work as intended.

geo=get(data(geospiza))      #a smaller dataset (for time reasons)
tmp=treedata(geo$phy, geo$dat)
## Warning in treedata(geo$phy, geo$dat): The following tips were not found in 'data' and were dropped from 'phy':
##  olivacea
phy=tmp$phy; dat=tmp$data[,"tarsusL"] 
plot(phy)

bm.M<-fitContinuous(phy, dat, model="BM")
ou.M<-fitContinuous(phy, dat, bounds = list(alpha = c(min = exp(-500), max = exp(2))),model="OU")
  bm.M
## GEIGER-fitted comparative model of continuous data
##  fitted 'BM' model parameters:
##  sigsq = 0.049339
##  z0 = 3.020419
## 
##  model summary:
##  log-likelihood = 10.567317
##  AIC = -17.134634
##  AICc = -15.934634
##  free parameters = 2
## 
## Convergence diagnostics:
##  optimization iterations = 100
##  failed iterations = 0
##  number of iterations with same best fit = 100
##  frequency of best fit = 1.00
## 
##  object summary:
##  'lik' -- likelihood function
##  'bnd' -- bounds for likelihood search
##  'res' -- optimization iteration summary
##  'opt' -- maximum likelihood parameter estimates
  ou.M
## GEIGER-fitted comparative model of continuous data
##  fitted 'OU' model parameters:
##  alpha = 3.488735
##  sigsq = 0.098524
##  z0 = 3.010718
## 
##  model summary:
##  log-likelihood = 11.401555
##  AIC = -16.803111
##  AICc = -14.136444
##  free parameters = 3
## 
## Convergence diagnostics:
##  optimization iterations = 100
##  failed iterations = 0
##  number of iterations with same best fit = 51
##  frequency of best fit = 0.51
## 
##  object summary:
##  'lik' -- likelihood function
##  'bnd' -- bounds for likelihood search
##  'res' -- optimization iteration summary
##  'opt' -- maximum likelihood parameter estimates
# Compare models: LRT & AIC
LRT.M<-(2*(ou.M$opt$lnL-bm.M$opt$lnL))
prob.M<-pchisq(LRT.M, 1, lower.tail=FALSE)
LRT.M
## [1] 1.668477
prob.M
## [1] 0.1964627
bm.M$opt$aic
## [1] -17.13463
ou.M$opt$aic #OU does not provide a better fit: use simpler model (BM)
## [1] -16.80311