Goals

Three goals for this exercise

Build likelihood function

We already know the likelihood function for Brownian motion on a tree.

\[ L(\mathbf{x} | \sigma^2, \theta, phy) = mvtnorm(mean=\mathbf{\theta}, variance=\sigma^2 * \mathbf{C}) \] Let’s write this equation as an R function.

# sloppy function will be fine for now
# x is two-element vector with c(sig2, theta)
# assume order of phy$tip.label is the same as rows of data
lnL <- function(x, phy, data) {
  # obtain C matrix
  C <- vcv(phy)
  
  # multiply by sig2 to get the real variance-covariance matrix
  sig2 <- x[1]
  V <- sig2 * C
  
  # make a vector of thetas
  n <- length(phy$tip.label)
  theta <- x[2]
  th <- rep(theta, n)
  
  # calculate the likelihood
  res <- dmvnorm(data, th, V, log=TRUE) # return log-likelihood
  return(res)
  
}

# let's try this with an example
# simulate a tree
exampleTree <- rphylo(100, birth=0.1, death=0.05)
# simulate data on the tree
exampleData <- fastBM(exampleTree, 1.0) # theta = 0 sig2 = 1: true model

lnL(x=c(1, 1), exampleTree, exampleData)
## [1] -258.1389

build the MCMC

set priors

We can create functions for our priors. Make two functions for each parameter, one that calculates the probability density (p-) and one that draws from the prior (r-).

Priors: \(\sigma^2 \sim U(0, 2)\), \(\theta \sim U(-3, 3)\)

psig2prior <- function(x) {
  if(x > 0 & x < 2) return(log(0.5))
  else return(-Inf)
}
rsig2prior <- function() {
  res<-runif(1, min=0, max=2)
  return(res)
}


pthetaprior <- function(x) {
  if(x > -3 & x < 3) return(log(1/6))
  else return(-Inf)
}
rthetaprior <- function() {
  res<-runif(1, min=-3, max=3)
  return(res)
}

# for example
psig2prior(0.001) # returns prob dist function
## [1] -0.6931472
rsig2prior() # random draw from the prior
## [1] 1.734603
# to start the mcmc we need starting values for our parameters
# we draw from the prior
starting_sig2 <- rsig2prior()
starting_theta <- rthetaprior()

MCM loop

This will have four pieces: (1) propose new parameter values, (2) calculate three key ratios necessary for mcmc, (3) calculate acceptance probability based on ratios, (4) accept or reject. Do this lots of times.

current_sig2 <- starting_sig2
current_theta <- starting_theta

wp <- 0.05 # proposal width

myMCMC <- function(ngens) {
  res <- matrix(nrow=ngens, ncol=4)
  colnames(res) <- c("Generation", "lnL", "sig2", "theta")
  
  for(i in 1:ngens) {
    # propose new parameter values
    proposed_sig2 <- current_sig2 + runif(1, min=-wp/2, max=wp/2)
    proposed_theta <- current_theta + runif(1, min=-wp/2, max=wp/2)

    
    # calulate three ratios
    # prior odds, proposal density, likelihood ratios
    # all on a ln-scale
    lnprior_odds_ratio <- psig2prior(proposed_sig2) - psig2prior(current_sig2) + pthetaprior(proposed_theta) - pthetaprior(current_theta)
    
    lnproposal_density_ratio <- 0
    
    lnlikelihood_ratio <- lnL(x=c(proposed_sig2, proposed_theta), exampleTree, exampleData) - lnL(x=c(current_sig2, current_theta), exampleTree, exampleData)
    
    # calculate acceptance probability
    lnacceptance_ratio <- lnprior_odds_ratio + lnproposal_density_ratio + lnlikelihood_ratio
    acceptance_ratio <- exp(lnacceptance_ratio)
    
    if(runif(1)<acceptance_ratio) { # accept the proposal
      current_sig2 <- proposed_sig2
      current_theta <- proposed_theta
      
    }
    
    thisGenlnL <- lnL(x=c(current_sig2, current_theta), exampleTree, exampleData)
    res[i,] <- c(i, thisGenlnL, current_sig2, current_theta)
    
    if(i %% 1000 == 0) cat(i, " generation done\n")
      
  }
  return(res)
  
}

chain1<-myMCMC(10000)
## 1000  generation done
## 2000  generation done
## 3000  generation done
## 4000  generation done
## 5000  generation done
## 6000  generation done
## 7000  generation done
## 8000  generation done
## 9000  generation done
## 10000  generation done
head(chain1)
##      Generation       lnL      sig2    theta
## [1,]          1 -259.3285 0.8329478 1.541378
## [2,]          2 -259.0458 0.8553222 1.537112
## [3,]          3 -258.9115 0.8675630 1.534231
## [4,]          4 -258.7307 0.8866436 1.540727
## [5,]          5 -258.5944 0.9034536 1.532337
## [6,]          6 -258.6997 0.8903906 1.548417
plot(chain1[,1:2], type="l")

plot(chain1[,1:2], type="l", ylim=c(-281, -270))

hist(chain1[,3])

Convergence diagnostics using CODA

require(coda)
# this is dumb because i am not dealing w burnin and thinning
chain1_coda<-mcmc(chain1[,3:4], start=1, thin=1)
summary(chain1_coda)
## 
## Iterations = 1:10000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##        Mean     SD Naive SE Time-series SE
## sig2  1.102 0.1668 0.001668        0.03789
## theta 1.603 0.6287 0.006287        0.41155
## 
## 2. Quantiles for each variable:
## 
##         2.5%    25%   50%   75% 97.5%
## sig2  0.8366 0.9743 1.075 1.210 1.454
## theta 0.6810 1.1711 1.409 2.206 2.848
plot(chain1_coda)

autocorr(chain1_coda)
## , , sig2
## 
##             sig2      theta
## Lag 0  1.0000000 -0.1478395
## Lag 1  0.9962858 -0.1481387
## Lag 5  0.9810926 -0.1493208
## Lag 10 0.9616218 -0.1512214
## Lag 50 0.8295004 -0.1670943
## 
## , , theta
## 
##              sig2     theta
## Lag 0  -0.1478395 1.0000000
## Lag 1  -0.1475111 0.9995334
## Lag 5  -0.1463376 0.9977006
## Lag 10 -0.1450124 0.9954642
## Lag 50 -0.1314315 0.9775722
rejectionRate(chain1_coda)
##       sig2      theta 
## 0.03350335 0.03350335
# one possible "solution" is to take out burnin and thin the chain
noBurnin <- chain1[-(1:2000),]
# this is a thinning trick in R = sample 1 of every n - here n = 100
thinnedResults <- noBurnin[seq(1, nrow(noBurnin), 100),]
thinned_coda <- mcmc(thinnedResults[,3:4], start=2001, thin=100)

summary(thinned_coda)
## 
## Iterations = 2001:9901
## Thinning interval = 100 
## Number of chains = 1 
## Sample size per chain = 80 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##        Mean     SD Naive SE Time-series SE
## sig2  1.108 0.1677  0.01875        0.04805
## theta 1.647 0.6812  0.07617        0.57745
## 
## 2. Quantiles for each variable:
## 
##         2.5%    25%   50%   75% 97.5%
## sig2  0.8519 0.9744 1.056 1.239 1.416
## theta 0.5282 1.1673 1.440 2.300 2.846
effectiveSize(thinned_coda)
##      sig2     theta 
## 12.179800  1.391807

Let’s try to fix our acceptance rate as a first step

The thing I am most suspicious of at first is ALWAYS the proposal window.

Here we changed both parameters at once (risky) and used the same proposal width for both (pretty dumb).

Let’s modify our function.

wp_sig2 <- 0.2
wp_theta <- 0.2
myMCMC <- function(ngens) {
  res <- matrix(nrow=ngens, ncol=4)
  colnames(res) <- c("Generation", "lnL", "sig2", "theta")

  sig2_accept <- 0
  theta_accept <- 0
  
  for(i in 1:ngens) {
    # propose new parameter values
    # component-wise sampling
    if(i %% 2 != 0) {
      proposed_sig2 <- current_sig2 + runif(1, min=-wp_sig2/2, max=wp_sig2/2)
      proposed_theta <- current_theta
    } else {
      proposed_theta <- current_theta + runif(1, min=-wp_theta/2, max=wp_theta/2)
      proposed_sig2 <- current_sig2
    }
    
    # calulate three ratios
    # prior odds, proposal density, likelihood ratios
    # all on a ln-scale
    lnprior_odds_ratio <- psig2prior(proposed_sig2) - psig2prior(current_sig2) + pthetaprior(proposed_theta) - pthetaprior(current_theta)
    
    lnproposal_density_ratio <- 0
    
    lnlikelihood_ratio <- lnL(x=c(proposed_sig2, proposed_theta), exampleTree, exampleData) - lnL(x=c(current_sig2, current_theta), exampleTree, exampleData)
    
    # calculate acceptance probability
    lnacceptance_ratio <- lnprior_odds_ratio + lnproposal_density_ratio + lnlikelihood_ratio
    acceptance_ratio <- exp(lnacceptance_ratio)
    
    if(runif(1)<acceptance_ratio) { # accept the proposal
      current_sig2 <- proposed_sig2
      current_theta <- proposed_theta
      if(i %% 2 != 0) {
        sig2_accept <- sig2_accept + 1
      } else {
        theta_accept <- theta_accept + 1
      }
    }
    
    thisGenlnL <- lnL(x=c(current_sig2, current_theta), exampleTree, exampleData)
    res[i,] <- c(i, thisGenlnL, current_sig2, current_theta)
    
    if(i %% 1000 == 0) cat(i, " generation done\n")
      
  }
  cat("sig2 accepted: ", sig2_accept," out of ", i/2, ", ratio: ", sig2_accept/(i/2), "\n")
  cat("theta accepted: ", theta_accept," out of ", i/2, ", ratio: ", theta_accept/(i/2), "\n")

  return(res)
  
}


wp_sig2 <- 0.5
wp_theta <- 4
try1<-myMCMC(2000)
## 1000  generation done
## 2000  generation done
## sig2 accepted:  677  out of  1000 , ratio:  0.677 
## theta accepted:  822  out of  1000 , ratio:  0.822
chain2<- myMCMC(10000)
## 1000  generation done
## 2000  generation done
## 3000  generation done
## 4000  generation done
## 5000  generation done
## 6000  generation done
## 7000  generation done
## 8000  generation done
## 9000  generation done
## 10000  generation done
## sig2 accepted:  3451  out of  5000 , ratio:  0.6902 
## theta accepted:  4102  out of  5000 , ratio:  0.8204
noBurnin <- chain2[-(1:2000),]
thinnedResults <- noBurnin[seq(1, nrow(noBurnin), 100),]
thinned_coda_chain2 <- mcmc(thinnedResults[,3:4], start=2001, thin=100)

summary(thinned_coda_chain2)
## 
## Iterations = 2001:9901
## Thinning interval = 100 
## Number of chains = 1 
## Sample size per chain = 80 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##         Mean     SD Naive SE Time-series SE
## sig2  1.0596 0.1664   0.0186         0.0186
## theta 0.1411 1.6240   0.1816         0.1816
## 
## 2. Quantiles for each variable:
## 
##          2.5%     25%     50%   75% 97.5%
## sig2   0.8084  0.9313 1.03936 1.174 1.430
## theta -2.6273 -1.1205 0.03353 1.512 2.905
effectiveSize(thinned_coda_chain2)
##  sig2 theta 
##    80    80
chain3<- myMCMC(10000)
## 1000  generation done
## 2000  generation done
## 3000  generation done
## 4000  generation done
## 5000  generation done
## 6000  generation done
## 7000  generation done
## 8000  generation done
## 9000  generation done
## 10000  generation done
## sig2 accepted:  3383  out of  5000 , ratio:  0.6766 
## theta accepted:  4104  out of  5000 , ratio:  0.8208
noBurnin <- chain3[-(1:2000),]
thinnedResults <- noBurnin[seq(1, nrow(noBurnin), 100),]
thinned_coda_chain3 <- mcmc(thinnedResults[,3:4], start=2001, thin=100)

combinedRuns <- mcmc.list(thinned_coda_chain2, thinned_coda_chain3)
plot(combinedRuns)

# this should be very very close to one. like, less than 1.001
gelman.diag(combinedRuns)
## Potential scale reduction factors:
## 
##       Point est. Upper C.I.
## sig2       1.035        1.1
## theta      0.992        1.0
## 
## Multivariate psrf
## 
## 1.01

how can i speed up my code?

We will profile the code to see what is going on inside the computer!

# "turn on profiling"
tmp<-tempfile()
Rprof(tmp, interval=0.1)
# "do a thing"
test <- myMCMC(10000)
## 1000  generation done
## 2000  generation done
## 3000  generation done
## 4000  generation done
## 5000  generation done
## 6000  generation done
## 7000  generation done
## 8000  generation done
## 9000  generation done
## 10000  generation done
## sig2 accepted:  3415  out of  5000 , ratio:  0.683 
## theta accepted:  4108  out of  5000 , ratio:  0.8216
# "turn off profiling"
Rprof(NULL)

# report results
summaryRprof(tmp)
## $by.self
##                          self.time self.pct total.time total.pct
## "prop.part"                    6.3    17.36        6.9     19.01
## "all.equal.numeric"            6.2    17.08       10.5     28.93
## "vcv.phylo"                    4.6    12.67       14.5     39.94
## "chol.default"                 4.6    12.67        5.3     14.60
## "[["                           1.6     4.41        1.7      4.68
## "all.equal"                    1.4     3.86       11.9     32.78
## "is.na"                        1.4     3.86        1.4      3.86
## "as.matrix"                    0.9     2.48        0.9      2.48
## "unique"                       0.8     2.20        1.0      2.75
## "as.vector"                    0.8     2.20        0.8      2.20
## ".reorder_ape"                 0.7     1.93        0.8      2.20
## "t.default"                    0.7     1.93        0.7      1.93
## "lnL"                          0.4     1.10       35.7     98.35
## "diag"                         0.4     1.10        1.1      3.03
## "backsolve"                    0.4     1.10        0.7      1.93
## "Cj"                           0.4     1.10        0.4      1.10
## "matrix"                       0.4     1.10        0.4      1.10
## "myMCMC"                       0.3     0.83       36.3    100.00
## "isSymmetric.matrix"           0.3     0.83       13.0     35.81
## "data.class"                   0.3     0.83        1.8      4.96
## "tryCatch"                     0.2     0.55        5.6     15.43
## "t"                            0.2     0.55        0.9      2.48
## "sapply"                       0.2     0.55        0.7      1.93
## "simplify2array"               0.2     0.55        0.5      1.38
## "$<-"                          0.2     0.55        0.2      0.55
## "all"                          0.2     0.55        0.2      0.55
## "colSums"                      0.2     0.55        0.2      0.55
## "mode"                         0.2     0.55        0.2      0.55
## "psig2prior"                   0.2     0.55        0.2      0.55
## "isSymmetric"                  0.1     0.28       13.1     36.09
## "doTryCatch"                   0.1     0.28        5.4     14.88
## "reorder"                      0.1     0.28        1.0      2.75
## "reorder.phylo"                0.1     0.28        0.9      2.48
## "unique.default"               0.1     0.28        0.2      0.55
## "$"                            0.1     0.28        0.1      0.28
## ".C"                           0.1     0.28        0.1      0.28
## ".getTreesFromDotdotdot"       0.1     0.28        0.1      0.28
## ".uncompressTipLabel"          0.1     0.28        0.1      0.28
## "[[.multiPhylo"                0.1     0.28        0.1      0.28
## "any"                          0.1     0.28        0.1      0.28
## "is.factor"                    0.1     0.28        0.1      0.28
## "isTRUE"                       0.1     0.28        0.1      0.28
## "ncol"                         0.1     0.28        0.1      0.28
## "numeric"                      0.1     0.28        0.1      0.28
## "pthetaprior"                  0.1     0.28        0.1      0.28
## 
## $by.total
##                          total.time total.pct self.time self.pct
## "myMCMC"                       36.3    100.00       0.3     0.83
## "block_exec"                   36.3    100.00       0.0     0.00
## "call_block"                   36.3    100.00       0.0     0.00
## "eval"                         36.3    100.00       0.0     0.00
## "evaluate"                     36.3    100.00       0.0     0.00
## "evaluate_call"                36.3    100.00       0.0     0.00
## "handle"                       36.3    100.00       0.0     0.00
## "in_dir"                       36.3    100.00       0.0     0.00
## "knitr::knit"                  36.3    100.00       0.0     0.00
## "process_file"                 36.3    100.00       0.0     0.00
## "process_group"                36.3    100.00       0.0     0.00
## "process_group.block"          36.3    100.00       0.0     0.00
## "rmarkdown::render"            36.3    100.00       0.0     0.00
## "timing_fn"                    36.3    100.00       0.0     0.00
## "withCallingHandlers"          36.3    100.00       0.0     0.00
## "withVisible"                  36.3    100.00       0.0     0.00
## "lnL"                          35.7     98.35       0.4     1.10
## "dmvnorm"                      20.8     57.30       0.0     0.00
## "vcv.phylo"                    14.5     39.94       4.6    12.67
## "vcv"                          14.5     39.94       0.0     0.00
## "isSymmetric"                  13.1     36.09       0.1     0.28
## "isSymmetric.matrix"           13.0     35.81       0.3     0.83
## "all.equal"                    11.9     32.78       1.4     3.86
## "all.equal.numeric"            10.5     28.93       6.2    17.08
## "prop.part"                     6.9     19.01       6.3    17.36
## "tryCatch"                      5.6     15.43       0.2     0.55
## "doTryCatch"                    5.4     14.88       0.1     0.28
## "tryCatchList"                  5.4     14.88       0.0     0.00
## "tryCatchOne"                   5.4     14.88       0.0     0.00
## "chol.default"                  5.3     14.60       4.6    12.67
## "chol"                          5.3     14.60       0.0     0.00
## "data.class"                    1.8      4.96       0.3     0.83
## "[["                            1.7      4.68       1.6     4.41
## "is.na"                         1.4      3.86       1.4     3.86
## "diag"                          1.1      3.03       0.4     1.10
## "unique"                        1.0      2.75       0.8     2.20
## "reorder"                       1.0      2.75       0.1     0.28
## "as.matrix"                     0.9      2.48       0.9     2.48
## "t"                             0.9      2.48       0.2     0.55
## "reorder.phylo"                 0.9      2.48       0.1     0.28
## "as.vector"                     0.8      2.20       0.8     2.20
## ".reorder_ape"                  0.8      2.20       0.7     1.93
## "t.default"                     0.7      1.93       0.7     1.93
## "backsolve"                     0.7      1.93       0.4     1.10
## "sapply"                        0.7      1.93       0.2     0.55
## "simplify2array"                0.5      1.38       0.2     0.55
## "Cj"                            0.4      1.10       0.4     1.10
## "matrix"                        0.4      1.10       0.4     1.10
## "$<-"                           0.2      0.55       0.2     0.55
## "all"                           0.2      0.55       0.2     0.55
## "colSums"                       0.2      0.55       0.2     0.55
## "mode"                          0.2      0.55       0.2     0.55
## "psig2prior"                    0.2      0.55       0.2     0.55
## "unique.default"                0.2      0.55       0.1     0.28
## "$"                             0.1      0.28       0.1     0.28
## ".C"                            0.1      0.28       0.1     0.28
## ".getTreesFromDotdotdot"        0.1      0.28       0.1     0.28
## ".uncompressTipLabel"           0.1      0.28       0.1     0.28
## "[[.multiPhylo"                 0.1      0.28       0.1     0.28
## "any"                           0.1      0.28       0.1     0.28
## "is.factor"                     0.1      0.28       0.1     0.28
## "isTRUE"                        0.1      0.28       0.1     0.28
## "ncol"                          0.1      0.28       0.1     0.28
## "numeric"                       0.1      0.28       0.1     0.28
## "pthetaprior"                   0.1      0.28       0.1     0.28
## 
## $sample.interval
## [1] 0.1
## 
## $sampling.time
## [1] 36.3
C<-vcv(exampleTree)

lnL_C<-function(x, C, data) {
  
  # multiply by sig2 to get the real variance-covariance matrix
  sig2 <- x[1]
  V <- sig2 * C
  
  # make a vector of thetas
  n <- dim(C)[1]
  theta <- x[2]
  th <- rep(theta, n)
  
  # calculate the likelihood
  res <- dmvnorm(data, th, V, log=TRUE) # return log-likelihood
  return(res)
  
}

myMCMC_faster <- function(ngens) {
  res <- matrix(nrow=ngens, ncol=4)
  colnames(res) <- c("Generation", "lnL", "sig2", "theta")

  sig2_accept <- 0
  theta_accept <- 0
  
  for(i in 1:ngens) {
    # propose new parameter values
    # component-wise sampling
    if(i %% 2 != 0) {
      proposed_sig2 <- current_sig2 + runif(1, min=-wp_sig2/2, max=wp_sig2/2)
      proposed_theta <- current_theta
    } else {
      proposed_theta <- current_theta + runif(1, min=-wp_theta/2, max=wp_theta/2)
      proposed_sig2 <- current_sig2
    }
    
    # calulate three ratios
    # prior odds, proposal density, likelihood ratios
    # all on a ln-scale
    lnprior_odds_ratio <- psig2prior(proposed_sig2) - psig2prior(current_sig2) + pthetaprior(proposed_theta) - pthetaprior(current_theta)
    
    lnproposal_density_ratio <- 0
    
    lnlikelihood_ratio <- lnL_C(x=c(proposed_sig2, proposed_theta), C, exampleData) - lnL_C(x=c(current_sig2, current_theta), C, exampleData)
    
    # calculate acceptance probability
    lnacceptance_ratio <- lnprior_odds_ratio + lnproposal_density_ratio + lnlikelihood_ratio
    acceptance_ratio <- exp(lnacceptance_ratio)
    
    if(runif(1)<acceptance_ratio) { # accept the proposal
      current_sig2 <- proposed_sig2
      current_theta <- proposed_theta
      if(i %% 2 != 0) {
        sig2_accept <- sig2_accept + 1
      } else {
        theta_accept <- theta_accept + 1
      }
    }
    
    thisGenlnL <- lnL_C(x=c(current_sig2, current_theta), C, exampleData)
    res[i,] <- c(i, thisGenlnL, current_sig2, current_theta)
    
    if(i %% 1000 == 0) cat(i, " generation done\n")
      
  }
  cat("sig2 accepted: ", sig2_accept," out of ", i/2, ", ratio: ", sig2_accept/(i/2), "\n")
  cat("theta accepted: ", theta_accept," out of ", i/2, ", ratio: ", theta_accept/(i/2), "\n")

  return(res)
  
}

chain4<-myMCMC_faster(10000)
## 1000  generation done
## 2000  generation done
## 3000  generation done
## 4000  generation done
## 5000  generation done
## 6000  generation done
## 7000  generation done
## 8000  generation done
## 9000  generation done
## 10000  generation done
## sig2 accepted:  3439  out of  5000 , ratio:  0.6878 
## theta accepted:  4052  out of  5000 , ratio:  0.8104

Fastest possible way to do this.

  1. Calculate PICs given tree and data. Just do this once, not in the MCMC loop.
  2. The mcmc loop uses these contrasts instead of the tree and data structure we were using before.
  3. Inside mcmc loop: