require(pomp)
require(plyr)
require(reshape2)
require(ggplot2)
require(grid)
require(scales)

theme_set(theme_bw())
options(stringsAsFactors=FALSE)

stopifnot(packageVersion("pomp")>="0.58-1")

Set up some facilities for doing parallel computation on a single multi-core machine. Package doMPI is an alternative for use on clusters.

require(foreach)
require(doMC)
require(iterators)
registerDoMC(cores=4)
set.seed(1860853338,kind="L'Ecuyer")

A little function to save doing unnecessary repeats.

juliaChild <- function (file, expr) {
  if (file.exists(file)) {
    readRDS(file)
    } else {
      val <- eval(expr)
      saveRDS(val,file=file)
      val
      }
  }

Trajectory-matching: fitting deterministic model to the data

CtNegBin(ρHt,1/k)
E[Ct|Ht]=ρHt
Var[Ct|Ht]=ρHt+(kρHt)2

Load the Ebola data and construct a pomp object holding the data and a simple SEIR model for it.

baseurl <- "http://kinglab.eeb.lsa.umich.edu/ICTPWID/SaoPaulo_2015/"
source(paste0(baseurl,"data/ebola.R"))

Plot the data.

require(plyr)
require(ggplot2)
require(reshape2)
source(paste0(baseurl,"data/eboladata.R"))
wa <- ddply(dat,~date,summarize,cases=sum(cases,na.rm=TRUE),country="WestAfrica")
dat <- join(dat,wa,type='full')
ggplot(data=dat,mapping=aes(x=date,y=cases,group=country))+geom_line(color='blue')+
  facet_grid(country~.,scales='free_y')

Plot cumulative case counts.

ggplot(data=ddply(dat,~country,mutate,cases=cumsum(cases)),
       mapping=aes(x=date,y=cases,group=country))+geom_line(color='red')+
  labs(y="cumulative cases")+
  facet_grid(country~.,scales='free_y')

Fit the deterministic SEIR model to data from Sierra Leone

juliaChild("tm-fit.rds",{
  starts <- profileDesign(R0=seq(1,2,length=20),
                          upper=c(k=1,rho=0.999),
                          lower=c(k=1e-8,rho=0.001),
                          nprof=10)
  
  profiles <- foreach (start=iter(starts,by='row'),
                       .combine=rbind,.inorder=FALSE,
                       .noexport=c("ebolaModel"),
                       .options.multicore=list(set.seed=TRUE),
                       .options.mpi=list(chunkSize=1,seed=2086168278L,info=TRUE)) %:%
    foreach (type=c("raw","cum"),.combine=rbind,.inorder=FALSE) %dopar%
    {
      tm <- ebolaModel(country="SierraLeone",type=type,na.rm=TRUE)
      coef(tm,names(start)) <- unname(unlist(start))
      if (coef(tm,"rho")==1) coef(tm,"rho") <- 0.999
      if (coef(tm,"rho")==0) coef(tm,"rho") <- 0.001
      tm <- traj.match(tm,est=c("k","rho","E_0","I_0"),transform=TRUE)
      if (coef(tm,"rho")==1) coef(tm,"rho") <- 0.999
      if (coef(tm,"rho")==0) coef(tm,"rho") <- 0.001
      tm <- traj.match(tm,method='subplex',control=list(maxit=1e5))
      pompUnload(tm)
      data.frame(country="SierraLeone",type=type,as.list(coef(tm)),
                 loglik=logLik(tm),conv=tm$convergence)
      }
  }) -> profiles

Profile likelihoods.

require(plyr)
require(reshape2)
require(ggplot2)
theme_set(theme_bw())

profiles$mod <- "det"
profiles <- subset(profiles,country=="SierraLeone"&(mod=="det"|type=="raw"))
profiles <- ddply(profiles,~type+mod,mutate,dll=loglik-max(loglik))

ggplot(data=ddply(profiles,~R0+type+mod,subset,loglik==max(loglik)&dll>-10),
       mapping=aes(x=R0,y=dll,group=interaction(type,mod),
                   color=interaction(type,mod),fill=interaction(type,mod)))+
  geom_point(alpha=0.5)+geom_smooth(method='gam',formula=y~s(x),alpha=0.2)+
  scale_color_manual(name="",values=c(raw.stoch='blue',cum.det='red',cum.stoch='green',raw.det='goldenrod'),
                     labels=c(raw.stoch="stochastic, raw",cum.det="deterministic, cumulative",
                              raw.det="deterministic, raw",cum.stoch="stochastic, cumulative"))+
  scale_fill_manual(name="",values=c(raw.stoch='blue',cum.det='red',cum.stoch='green',raw.det='goldenrod'),
                    labels=c(raw.stoch="stochastic, raw",cum.det="deterministic, cumulative",
                             raw.det="deterministic, raw",cum.stoch="stochastic, cumulative"))+
  geom_hline(yintercept=-0.5*qchisq(p=0.95,df=1))+
  labs(x=expression("basic reproduction number,"~R[0]),y="log scaled likelihood")+
  expand_limits(x=c(1,2))+ylim(c(-10,1))+
  theme(strip.background=element_rect(color=NA,fill=NA),
        text=element_text(size=15),
        strip.text=element_text(face='bold'),
        panel.grid.minor=element_line(color=gray(0.9)),
        legend.position=c(0.8,0.8))

A simulation study.

Generate some simulated data.

nsims <- 50
po <- ebolaModel(country="Guinea",timestep=0.1)

params <- parmat(coef(po),3)
params["k",] <- c(0,0.2,0.5)
paramnames <- names(coef(po))
params
##               [,1]         [,2]         [,3]
## N     1.062897e+07 1.062897e+07 1.062897e+07
## R0    1.400000e+00 1.400000e+00 1.400000e+00
## alpha 6.786632e-01 6.786632e-01 6.786632e-01
## gamma 1.053605e+00 1.053605e+00 1.053605e+00
## rho   2.000000e-01 2.000000e-01 2.000000e-01
## cfr   7.000000e-01 7.000000e-01 7.000000e-01
## k     0.000000e+00 2.000000e-01 5.000000e-01
## S_0   9.999991e-01 9.999991e-01 9.999991e-01
## E_0   4.654124e-07 4.654124e-07 4.654124e-07
## I_0   4.654124e-07 4.654124e-07 4.654124e-07
## R_0   1.000000e-08 1.000000e-08 1.000000e-08
juliaChild("sims.rds",{
  simdat <- simulate(po,params=params,nsim=nsims,seed=208335746L,
                     as.data.frame=TRUE,obs=TRUE)
  rename(simdat,c(time="week")) -> simdat
  mutate(simdat,k=params["k",((as.integer(sim)-1)%%ncol(params))+1])
  }) -> simdat

Typical realizations.

subset(simdat,as.integer(sim)<=12) -> dat
ddply(dat,~k,mutate,sim=as.integer(factor(sim))) -> dat

ggplot(data=dat,
       mapping=aes(x=week,y=cases,group=sim))+
  geom_line()+
  facet_grid(sim~k,
             labeller=labeller(
               sim=label_value,
               k=label_bquote(k==.(x))))+
  theme(strip.background=element_rect(color=NA,fill=NA),
        text=element_text(size=15),
        strip.text=element_text(face='plain'),
        panel.grid.minor=element_line(color=gray(0.9)),
        axis.text.y=element_blank(),
        axis.text.x=element_text(size=rel(0.6)))

For each simulated dataset, fit the deterministic model using trajectory matching.

juliaChild("tm-sim-profiles-R0.rds",{
  fits <- foreach(simul=1:nsims,
                  .combine=rbind,.inorder=FALSE,
                  .noexport=c("ebolaModel"),
                  .options.multicore=list(set.seed=TRUE),
                  .options.mpi=list(chunkSize=10,info=TRUE)) %:%
    foreach(type=c("raw","cum"),.combine=rbind,.inorder=FALSE) %dopar%
    {
      dat <- subset(simdat,sim==simul,select=c(week,cases,deaths))
      tm <- ebolaModel(country="Guinea",data=dat,type=as.character(type))
      st <- params[,(simul-1)%%3+1]
      true.k <- unname(st["k"])
      true.R0 <- unname(st["R0"])
      true.rho <- unname(st["rho"])
      st["k"] <- st["k"]+1e-6
      tm <- traj.match(tm,start=st,est=c("R0","k","rho"),transform=TRUE)
      if (coef(tm,"rho")==1) coef(tm,"rho") <- 0.999
      if (coef(tm,"rho")==0) coef(tm,"rho") <- 0.001
      tm <- traj.match(tm,est=c("R0","k","rho"),method='subplex',transform=TRUE)
      pompUnload(tm)
      data.frame(sim=simul,type=as.character(type),
                 true.k=true.k,true.R0=true.R0,true.rho=true.rho,
                 as.list(coef(tm)),loglik=logLik(tm),
                 conv=tm$convergence)
      }
  
  profiles <- foreach (fit=iter(fits,by="row"),
                       .noexport=c("ebolaModel"),
                       .combine=rbind,.inorder=FALSE) %:%
    foreach (r0=seq(from=0.7,to=3,length=200),
             .combine=rbind,.inorder=FALSE,
             .options.multicore=list(set.seed=TRUE),
             .options.mpi=list(chunkSize=200)
             ) %dopar%
    {
      dat <- subset(simdat,sim==fit$sim,select=c(week,cases,deaths))
      tm <- ebolaModel(country="Guinea",data=dat,type=as.character(fit$type))
      coef(tm) <- unlist(fit[paramnames])
      coef(tm,"R0") <- r0
      if (coef(tm,"rho")==1) coef(tm,"rho") <- 0.999
      if (coef(tm,"rho")==0) coef(tm,"rho") <- 0.001
      tm <- traj.match(tm,est=c("k","rho"),transform=TRUE)
      if (coef(tm,"rho")==1) coef(tm,"rho") <- 0.999
      if (coef(tm,"rho")==0) coef(tm,"rho") <- 0.001
      tm <- traj.match(tm,est=c("k","rho"),transform=TRUE,method='subplex')
      pompUnload(tm)
      data.frame(sim=fit$sim,type=fit$type,
                 true.k=fit$true.k,true.R0=fit$true.R0,true.rho=fit$true.rho,
                 as.list(coef(tm)),loglik=logLik(tm),
                 conv=tm$convergence)
      }
  }) -> profiles

Refine estimates

juliaChild("tm-sim-fits.rds",{
  ddply(profiles,~type+sim+true.k,subset,loglik==max(loglik)) -> starts
  
  fits <- foreach(fit=iter(starts,by="row"),.combine=rbind,.inorder=FALSE,
                  .noexport=c("ebolaModel"),
                  .options.multicore=list(set.seed=TRUE),
                  .options.mpi=list(chunkSize=30)) %dopar%
    {
      dat <- subset(simdat,sim==fit$sim,select=c(week,cases,deaths))
      tm <- ebolaModel(country="Guinea",data=dat,type=as.character(fit$type))
      coef(tm) <- unlist(fit[paramnames])
      if (coef(tm,"rho")==1) coef(tm,"rho") <- 0.999
      if (coef(tm,"rho")==0) coef(tm,"rho") <- 0.001
      tm <- traj.match(tm,est=c("k","rho"),transform=TRUE)
      if (coef(tm,"rho")==1) coef(tm,"rho") <- 0.999
      if (coef(tm,"rho")==0) coef(tm,"rho") <- 0.001
      tm <- traj.match(tm,est=c("k","rho"),transform=TRUE,method='subplex')
      pompUnload(tm)
      data.frame(sim=fit$sim,type=fit$type,
                 true.k=fit$true.k,true.R0=fit$true.R0,true.rho=fit$true.rho,
                 as.list(coef(tm)),loglik=logLik(tm),
                 conv=tm$convergence)
      }
  }) -> fits

Results

pval <- 0.99

ddply(profiles,
      ~type+sim+true.k+true.R0+true.rho,
      summarize,
      maxloglik=max(loglik),
      crit=maxloglik-0.5*qchisq(df=1,p=pval), ## critical LR
      R0lo=min(R0[loglik>=crit]),           ## CI, lower bound
      R0hi=max(R0[loglik>=crit]),           ## CI, upper bound
      R0.CIwidth=R0hi-R0lo,                 ## width of CI
      R0cov=(true.R0>R0lo)&&(true.R0<R0hi)  ## truth in CI?
      ) -> profSummary

ddply(profSummary,
      ~true.k+type,
      summarize,
      R0cov.min=qbeta(p=0.025,
                      shape1=sum(R0cov)+0.5,
                      shape2=sum(!R0cov)+0.5),
      R0cov.max=qbeta(p=0.975,
                      shape1=sum(R0cov)+0.5,
                      shape2=sum(!R0cov)+0.5),
      R0cov=sum(R0cov)/length(R0cov)
      ) -> cover

R0comp <- ggplot(data=fits,
                 mapping=aes(x=factor(true.k),y=R0,
                             group=interaction(true.k,type),
                             fill=type)
                 )+
  stat_boxplot(notch=FALSE,coef=1.5,color='black')+
  geom_hline(aes(yintercept=true.R0),linetype=2)+
  theme_classic()+
  coord_cartesian(ylim=c(1,1.6))+
  scale_fill_manual(values=c(raw='blue',cum='red'),
                    labels=c("raw","cumulative"))+
  guides(fill=FALSE,color=FALSE)+
  labs(x=expression("error overdispersion,"~k),y=expression("estimated"~R[0]))

kcomp <- ggplot(data=fits,
                mapping=aes(x=factor(true.k),y=k,
                            group=interaction(true.k,type),
                            fill=type)
                )+
  stat_boxplot(notch=FALSE,coef=1.5,color='black')+
  annotate("segment",
           x=(1:3)-0.4,xend=(1:3)+0.4,
           y=c(0,0.2,0.5),yend=c(0,0.2,0.5),
           linetype=2)+
  coord_cartesian(ylim=c(-0.1,1.05))+
  theme_classic()+
  scale_fill_manual(values=c(raw='blue',cum='red'),
                    labels=c(raw="raw",cum="cumulative"))+
  labs(x=expression("error overdispersion,"~k),y=expression("estimated"~k),
       fill="Data used:")+
  theme(legend.position=c(0.5,0.8))

R0ci <- ggplot(data=profSummary,
               mapping=aes(x=factor(true.k),y=R0.CIwidth,
                           group=interaction(true.k,type),
                           fill=type)
               )+
  stat_boxplot(notch=FALSE,coef=1.5,color='black')+
  coord_cartesian(ylim=c(0,0.4))+
  labs(x=expression("error overdispersion,"~k),
       y=expression(R[0]~"CI width"))+
  theme_classic()+
  scale_fill_manual(values=c(raw='blue',cum='red'))+
  guides(fill=FALSE,color=FALSE)

R0cov <- ggplot(data=cover,
                mapping=aes(x=factor(true.k),y=R0cov,
                            ymin=R0cov.min,ymax=R0cov.max,
                            color=type))+
  geom_errorbar(size=1,width=0.1)+geom_point(size=2)+
  geom_hline(yintercept=pval,linetype=2)+
  theme_classic()+
  scale_color_manual(values=c(raw='blue',cum='red'))+
  guides(color=FALSE)+
  expand_limits(y=c(0,1))+
  labs(x=expression("error overdispersion,"~k),
       y=expression(R[0]~"coverage"))

Fitting a stochastic model

Initial iterated filtering runs

juliaChild("mifs1.rds",{
  starts <- sobolDesign(lower=c(R0=1,rho=0,k=0.01,E_0=0,I_0=0),
                        upper=c(R0=5,rho=1,k=1,E_0=1e-4,I_0=1e-4),
                        nseq=500)
  
  foreach (ctry=c("SierraLeone","Liberia","Guinea","WestAfrica"),
           .combine=rbind,.inorder=FALSE,
           .noexport=c("ebolaModel"),
           .options.multicore=list(set.seed=TRUE),
           .options.mpi=list(chunkSize=1,seed=762304675L,info=TRUE)
           ) %:%
    foreach (type=c("raw","cum"),.combine=rbind,.inorder=FALSE) %:%
    foreach (start=iter(starts,by='row'),
             .combine=rbind,.inorder=FALSE) %dopar%
    {
      tic <- Sys.time()
      
      po <- ebolaModel(country=ctry,type=type,na.rm=TRUE)
      coef(po,names(start)) <- unname(unlist(start))
      
      ## First refine the initial conditions using trajectory matching or mif
      mf <- mif(po,Nmif=20,
                rw.sd = c(E_0=0.02,I_0=0.02),ivps=c("E_0","I_0"),
                Np=2000,var.factor=2,method="mif2",
                cooling.type="hyperbolic",cooling.fraction=0.1,
                transform=TRUE,verbose=FALSE)
      coef(po,c("S_0","E_0","I_0","R_0")) <- unname(coef(mf,c("S_0","E_0","I_0","R_0")))
      
      mf <- mif(po, Nmif = 10,
                rw.sd = c(R0=0.02,rho=0.02,k=0.02,E_0=0.02,I_0=0.02),
                ivps = c("E_0","I_0"),
                Np = 2000,
                var.factor = 2,
                method = "mif2",
                cooling.type = "hyperbolic",
                cooling.fraction = 0.5,
                transform = TRUE,
                verbose = FALSE)
      mf <- continue(mf, Nmif = 50,
                     cooling.fraction = 0.1,
                     method = "mif2",
                     cooling.type = "hyperbolic")
      
      ## Runs 10 particle filters to assess Monte Carlo error in likelihood
      pf <- replicate(10,pfilter(mf,Np=5000,max.fail=Inf))
      ll <- sapply(pf,logLik)
      ll <- logmeanexp(ll, se = TRUE)
      nfail <- sapply(pf,getElement,"nfail")
      toc <- Sys.time()
      etime <- toc-tic
      units(etime) <- "hours"
      
      pompUnload(mf)
      
      data.frame(country=ctry,type=type,as.list(coef(mf)),
                 loglik = ll[1],
                 loglik.se = ll[2],
                 nfail.min = min(nfail),
                 nfail.max = max(nfail),
                 etime = as.numeric(etime))
      }
  }) -> pars

Compute likelihood profiles.

juliaChild("profile1.rds",{
  pars <- ddply(pars,~country+type,subset,loglik>max(loglik)-20)
  pars <- subset(pars,is.finite(loglik)&nfail.max==0,
                 select=-c(loglik,loglik.se,nfail.max,nfail.min,etime))
  pars <- melt(pars,id=c("country","type"),variable.name="parameter")
  pars <- ddply(pars,~country+type+parameter,summarize,
                min=min(value),max=max(value))
  pars <- melt(pars,id=c("country","type","parameter"))
  
  starts <- daply(pars,country~type~parameter,function(x)range(x$value))
  starts <- adply(starts,c(1,2),function(x)
    profileDesign(R0=seq(from=0.8*x["R0",1],to=1.2*x["R0",2],length=60),
                  lower=x[c("rho","k","S_0","E_0","I_0"),1],
                  upper=x[c("rho","k","S_0","E_0","I_0"),2],
                  nprof=100)
    )
  
  foreach (start=iter(starts,by='row'),
           .combine=rbind,.inorder=FALSE,       
           .options.multicore=list(set.seed=TRUE),
           .options.mpi=list(chunkSize=1,seed=1264624821L),
           .noexport=c("ebolaModel")) %dopar%
    {
      tic <- Sys.time()
      
      ctry <- as.character(start$country)
      type <- as.character(start$type)
      st <- unlist(subset(start,select=-c(country,type)))
      
      po <- ebolaModel(country=ctry,timestep=0.01,type=type,na.rm=TRUE,nstage=3L)
      coef(po,names(st)) <- st
      
      mf <- mif(po, Nmif=10,
                rw.sd = c(rho=0.02,k=0.02,E_0=0.02,I_0=0.02),
                ivps = c("E_0","I_0"),
                Np = 2000,
                var.factor = 2,
                method = "mif2",
                cooling.type = "hyperbolic", 
                cooling.fraction = 0.5,
                transform = TRUE,
                verbose = FALSE)
      mf <- continue(mf, Nmif = 50, cooling.fraction = 0.1)
      
      ## Runs 10 particle filters to assess Monte Carlo error in likelihood
      pf <- replicate(10,pfilter(mf,Np=5000,max.fail=Inf))
      ll <- sapply(pf,logLik)
      ll <- logmeanexp(ll, se = TRUE)
      nfail <- sapply(pf,getElement,"nfail")
      toc <- Sys.time()
      etime <- toc-tic
      units(etime) <- "hours"
      
      ## unload dynamical executable
      pompUnload(po)
      
      data.frame(country=ctry,type=type,as.list(coef(mf)),
                 loglik = ll[1],
                 loglik.se = ll[2],
                 nfail.min = min(nfail),
                 nfail.max = max(nfail),
                 etime = as.numeric(etime))
      }
  }) -> pars

Compute unbiased likelihoods.

juliaChild("profile2.rds",{
  pars <- subset(pars,is.finite(loglik)&nfail.max==0)
  pars <- ddply(pars,~country+type+R0,subset,rank(-loglik)<=5)
  pars <- subset(pars,select=-c(loglik,loglik.se,nfail.max,nfail.min,etime))
  
  results <- foreach (start=iter(pars,by='row'),
                      .combine=rbind,.inorder=FALSE,     
                      .options.multicore=list(set.seed=TRUE), 
                      .options.mpi=list(chunkSize=1,seed=1264624821L),
                      .noexport=c("ebolaModel")) %dopar%
    {
      tic <- Sys.time()
      
      ctry <- as.character(start$country)
      type <- as.character(start$type)
      st <- unlist(subset(start,select=-c(country,type)))
      
      po <- ebolaModel(country=ctry,timestep=0.01,type=type,na.rm=TRUE,nstage=3L)
      coef(po,names(st)) <- unname(st)
      
      ## Runs 10 particle filters to assess Monte Carlo error in likelihood
      pf <- try(replicate(10,pfilter(po,Np=5000,max.fail=Inf)))
      
      ## unload dynamical executable
      pompUnload(po)
      
      toc <- Sys.time()
      etime <- toc-tic
      units(etime) <- "hours"
      
      if (!inherits(pf,"try-error")) {
        
        ll <- sapply(pf,logLik)
        ll <- logmeanexp(ll, se = TRUE)
        nfail <- sapply(pf,getElement,"nfail")
        
        data.frame(country=ctry,type=type,as.list(coef(po)),
                   loglik = ll[1],
                   loglik.se = ll[2],
                   nfail.min = min(nfail),
                   nfail.max = max(nfail),
                   etime = as.numeric(etime))
        
        } else {
          
          data.frame(country=ctry,type=type,as.list(coef(po)),
                     loglik = NA,
                     loglik.se = NA,
                     nfail.min = NA,
                     nfail.max = NA,
                     etime = as.numeric(etime))
          }
      }
  }) -> prof

Profile likelihood plots.

prof <- ddply(prof,~country+type,mutate,dll=loglik-max(loglik),
              country=factor(country,levels=c("Guinea","Liberia","SierraLeone","WestAfrica")))
prof <- ddply(prof,~country+type+R0,subset,loglik==max(loglik))

ggplot(data=prof,
       mapping=aes(x=R0,y=dll,color=type,fill=type))+
  facet_wrap(~country,scales="fixed",ncol=1)+
  geom_point(alpha=0.5)+
  geom_smooth(method='loess',alpha=0.2)+
  scale_color_manual(name="",values=c(raw='blue',cum='red'),
                     labels=c(rawh="raw",cum="cumulative"))+
  scale_fill_manual(name="",values=c(raw='blue',cum='red'),
                    labels=c(rawh="raw",cum="cumulative"))+
  geom_hline(yintercept=-0.5*qchisq(p=0.95,df=1))+
  labs(x = expression("basic reproduction number, "~R[0]),
       y = "log scaled likelihood",
       color = "Data used:", fill="Data used:")+
  xlim(c(1,2.5))+ylim(c(-10,0))+
  theme_classic()+
  theme(strip.background=element_rect(color=NA,fill=NA),
        text=element_text(size=14),
        strip.text=element_text(face='bold'),
        panel.grid.minor=element_line(color=gray(0.9)),
        legend.position=c(0.75,0.89),
        legend.key.size=unit(10,"pt"),
        legend.text=element_text(size=rel(0.8))
        )

Table of MLEs.

mles <- ddply(prof,~country+type,subset,loglik==max(loglik)&type=="raw")
mles <- mutate(mles,odds=(1-rho)/rho)
mles <- subset(mles,select=-c(dll,loglik,nfail.min,nfail.max,S_0,E_0,I_0,R_0,alpha,gamma,cfr))
kable(mles)
country type N R0 rho k loglik.se etime odds
Guinea raw 10628972 1.217663 0.0144752 0.3822749 0.0110781 0.0706478 68.083447
Liberia raw 4092310 1.939128 0.0234293 0.2357219 0.0056410 0.0311655 41.681607
SierraLeone raw 6190280 1.370597 0.1235683 0.0761270 0.0130909 0.0305334 7.092691
WestAfrica raw 20911562 1.477556 0.0061551 0.1891024 0.0147498 0.0653890 161.467758

Diagnostics: does the model fit the data?

Make some typical simulations of the fitted model.

mles <- ddply(prof,~country+type,subset,loglik==max(loglik))
mles <- subset(mles,type=="raw")

time1 <- c(Guinea="2014-01-05",Liberia="2014-06-01",
           SierraLeone="2014-06-08",WestAfrica="2014-01-05")

juliaChild("diagnostics-sim.rds",{
  sims <- foreach (mle=iter(mles,by='row'),.combine=rbind,
                   .options.multicore=list(set.seed=TRUE), 
                   .options.mpi=list(chunkSize=1,seed=1264624821L)
                   ) %dopar%
    {
      country=as.character(mle$country)
      type=as.character(mle$type)
      M <- ebolaModel(country=country,type=type,
                      na.rm=TRUE,nstage=3,timestep=0.01)
      p <- unlist(subset(mle,select=-c(country,type,
                                       loglik,loglik.se,
                                       nfail.min,nfail.max,etime)))
      coef(M,names(p)) <- unname(unlist(p))
      
      sim <- simulate(M,nsim=10,as.data.frame=TRUE,obs=TRUE,include.data=TRUE,
                      seed=2186L)
      
      pompUnload(M)
      
      sim$date <- as.Date(time1[country])+sim$time*7
      sim$country <- country
      sim$type <- type
      sim
      }
  
  mutate(sims,
         data=ifelse(sim=="data","data","simulation"),
         country=mapvalues(country,
                           from=c("SierraLeone","WestAfrica"),
                           to=c("Sierra Leone","West Africa"))
         )
  }) -> sim

Compute some probes (summary statistics) on the simulations and the data.

juliaChild("diagnostics-probes.rds",{
  pbs <- foreach (mle=iter(mles,by='row'),.combine=rbind,
                  .options.multicore=list(set.seed=TRUE), 
                  .options.mpi=list(chunkSize=1,seed=1264624821L)) %dopar%
    {
      country=as.character(mle$country)
      type=as.character(mle$type)
      M <- ebolaModel(country=country,type=type,
                      na.rm=TRUE,nstage=3,timestep=0.01)
      p <- unlist(subset(mle,select=-c(country,type,
                                       loglik,loglik.se,
                                       nfail.min,nfail.max,etime)))
      coef(M,names(p)) <- unname(unlist(p))
      
      pb <- probe(M,probes=list(probe.acf(var="cases",lags=1,type="correlation")),
                  nsim=500,seed=1878812716L)
      pb <- as.data.frame(pb)
      pb <- mutate(pb,
                   sim=rownames(pb),
                   data=ifelse(sim=="data","data","simulation"),
                   type=type,
                   country=country)
      pb
      }
  mutate(pbs,
         country=mapvalues(country,
                           from=c("SierraLeone","WestAfrica"),
                           to=c("Sierra Leone","West Africa")))
  
  }) -> pbs1

Some additional diagnostics.

Among these is exponential growth rate

probe.trend <- function (var) {
  if (length(var) > 1) stop(sQuote("probe.trend")," is a univariate probe")
  function (y) {
    cases <- as.numeric(y[var, ])
    df <- data.frame(week = 1:length(cases), cases = cases)
    M <- lm(log(1 + cases) ~ week, data = df)
    as.numeric(coef(M)[2])
    }
  }

juliaChild("diagnostics-addl-probes.rds",{
  pbs <- foreach (mle=iter(mles,by='row'),.combine=rbind,
                  .options.multicore=list(set.seed=TRUE), 
                  .options.mpi=list(chunkSize=1,seed=1264624821L)) %dopar%
    {
      country=as.character(mle$country)
      type=as.character(mle$type)
      M <- ebolaModel(country=country,type=type,
                      na.rm=TRUE,nstage=3,timestep=0.01)
      p <- unlist(subset(mle,select=-c(country,type,
                                       loglik,loglik.se,
                                       nfail.min,nfail.max,etime)))
      coef(M,names(p)) <- unname(unlist(p))
      
      pb <- probe(M,
                  probes=list(sd=probe.sd(var="cases"),
                              probe.quantile(var="cases",prob=c(0.9)),
                              probe.acf(var="cases",lags=c(1,2,3),type="correlation"),
                              trend=probe.trend(var="cases")),
                  nsim=2000,seed=2186L
                  )
      
      mutate(as.data.frame(pb),
             sim=rownames(pb),
             kind=ifelse(sim=="data","data","simulation"),
             type=type,
             country=country)
      }
  
  pbs <- melt(pbs,id=c("country","type","kind","sim"),variable.name="probe")
  pbs <- mutate(pbs,
                country=mapvalues(country,
                                  from=c("SierraLeone","WestAfrica"),
                                  to=c("Sierra Leone","West Africa")),
                probe=mapvalues(probe,
                                from=c("sd","90%","acf.1.cases","acf.2.cases","acf.3.cases","trend"),
                                to=c("SD","90%ile","ACF(1)","ACF(2)","ACF(3)","TREND")))
  arrange(pbs,country,type,probe,kind,sim)
  }) -> probes

Compare data with simulations.

ddply(pbs1,~country+type,summarize,
      k=sum(acf.1.cases[data=="simulation"]>acf.1.cases[data=="data"]),
      n=length(acf.1.cases[data=="simulation"]),
      pval=(k+0.5)/(n+1)) -> pvals

nsim <- length(unique(sim$sim))-1
npsim <- length(unique(pbs1$sim))-1

mainpl <- ggplot(data=sim,
                 mapping=aes(x=date,y=cases,group=sim,color=data,size=data))+
  geom_line()+
  scale_color_manual(values=c(simulation=gray(0.6),data='blue'))+
  scale_size_manual(values=c(simulation=0.6,data=1.2))+
  facet_wrap(~country,scales='free')+
  labs(color=NA,y="weekly cases",x="date")+
  guides(color=guide_legend(title=NULL),size=guide_legend(title=NULL))+
  theme_classic()+
  theme(strip.background=element_rect(color=NA,fill=NA),
        text=element_text(size=15),
        strip.text=element_text(face='bold'),
        panel.grid.minor=element_line(color=gray(0.9)),
        legend.position=c(0.5,1.05),
        legend.background=element_rect(fill=NA),
        legend.direction='horizontal'
        )

hists <- dlply(pbs1,~country,
               function(x) {
                 pval <- subset(pvals,country==unique(x$country))$pval
                 ggplot(data=subset(x,data=="simulation"),
                        mapping=aes(x=acf.1.cases))+
                   geom_histogram(binwidth=0.05,color='black',fill=gray(0.8),
                                  mapping=aes(y=..density..))+
                   expand_limits(x=c(0,1),y=c(0,4))+
                   labs(x="ACF(1)",y="density")+
                   geom_vline(color='blue',data=subset(x,data=="data"),size=1.2,
                              mapping=aes(xintercept=acf.1.cases))+
                   annotate("text",x=0,y=3.5,hjust=0,
                            label=paste0("P==",signif(pval,2)),parse=TRUE,size=3)+
                   theme_bw()+
                   theme(text=element_text(size=8),
                         plot.background=element_rect(fill=NA,color=NA),
                         panel.border=element_blank(),
                         axis.line.y=element_blank(),
                         axis.text.y=element_blank(),
                         axis.ticks.y=element_blank(),
                         axis.ticks.margin=unit(0,"lines"),
                         panel.grid=element_blank(),
                         plot.margin=unit(c(1,1,0.5,0.5),"lines"))
                 }
               )

ggplot(data=subset(probes,type=="raw"&kind=="simulation"),
       mapping=aes(x=value))+
  geom_density(fill=grey(0.9))+
  facet_wrap(~probe+country,scales='free',ncol=4)+
  geom_vline(data=subset(probes,kind=='data'),
             mapping=aes(xintercept=value),
             color='blue')+
  labs(x="probe value",y="probability density")+
  theme(strip.background=element_rect(color=NA,fill=NA),
        text=element_text(size=14),
        strip.text=element_text(face='plain',size=10),
        panel.grid.minor=element_line(color=gray(0.9)),
        axis.text.y=element_blank(),
        axis.text.x=element_text(size=rel(0.6)))

Forecasting

juliaChild("forecasts.rds",{
  set.seed(988077383L)
  stopifnot(packageVersion("pomp")>="0.58-4")
  
  starts <- c(Guinea="2014-01-05",Liberia="2014-06-01",SierraLeone="2014-06-08")
  
  M0 <- ebolaModel(country="SierraLeone",
                   timestep=0.01,nstageE=3,
                   type="raw",na.rm=TRUE)
  
  datmax <- max(time(M0))
  
  ### DETERMINISTIC MODELS
  
  mles <- readRDS("tm-fit.rds")
  mles <- ddply(mles,~country+type,subset,loglik==max(loglik),
                select=-c(loglik,conv))
  mles <- melt(mles,id=c("country","type"))
  mles <- mutate(mles,value=round(value,6))
  mles <- unique(mles)
  mles <- dcast(mles,country+type~variable,value.var='value')
  
  fc_tm <- foreach (mle=iter(mles,by='row'),nsamp=c(50000,10000),
                    .inorder=TRUE,.combine=rbind) %do%
    {
      theta <- unlist(subset(mle,select=-c(country,type)))
      params <- parmat(theta,nsamp)
      t(sobolDesign(lower=0.8*theta[c("R0","rho","k","S_0","E_0","I_0")],
                    upper=1.2*theta[c("R0","rho","k","S_0","E_0","I_0")],
                    nseq=nsamp)
        ) -> params[c("R0","rho","k","S_0","E_0","I_0"),]
      
      M1 <- ebolaModel(country="SierraLeone",
                       timestep=0.01,nstageE=3,
                       type=as.character(mle$type),na.rm=TRUE)
      
      save.seed <- .Random.seed
      set.seed(923530037L,kind="L'Ecuyer-CMRG")
      
      loglik <- foreach (k=seq_len(ncol(params)),
                         .inorder=TRUE,.combine=c,
                         .noexport=c("M0","M1"),
                         .options.multicore=list(set.seed=TRUE),
                         .options.mpi=list(chunkSize=5,info=TRUE)) %dopar% {
                           tm <- traj.match(M1,start=params[,k])
                           logLik(tm)
                           }
      
      weights <- exp(loglik)/sum(exp(loglik))
      cat("effective sample size for type",sQuote(mle$type),
          "in",mle$country,"=",1/sum(weights^2),"\n")
      
      sample_ind <- sample.int(ncol(params),size=nsamp,replace=TRUE,prob=weights)
      params <- params[,sample_ind]
      
      sims <- foreach (k=seq_len(ncol(params)),
                       .inorder=FALSE,
                       .noexport=c("M0","M1"),
                       .options.multicore=list(set.seed=TRUE),
                       .options.mpi=list(chunkSize=5,info=TRUE)) %dopar%
        {
          traj <- trajectory(M0,times=1:30,params=params[,k])
          dat <- rmeasure(M0,times=1:30,params=params[,k],x=traj)
          data.frame(time=1:30,
                     incidence=traj["N_EI",,,drop=TRUE],
                     mean.cases=params["rho",k]*traj["N_EI",,,drop=TRUE],
                     cases=dat["cases",,,drop=TRUE])
          }
      
      names(sims) <- seq_along(sims)
      ldply(sims,.id='rep') -> sims
      
      .Random.seed <<- save.seed
      
      mutate(sims,type=mle$type,country=mle$country) -> sims
      
      sims
      }
  
  fc_tm$period <- ifelse(fc_tm$time<=datmax,"calibration","projection")
  fc_tm <- subset(fc_tm,select=c(country,type,time,rep,cases,incidence,mean.cases,period))
  
  ### STOCHASTIC MODEL
  
  nsamp <- 1e4
  
  pars <- join(readRDS("mifs1.rds"),readRDS("profile2.rds"),type='full')
  pars <- subset(pars,country=="SierraLeone"&type=="raw")
  mutate(pars,weights=exp(loglik)/sum(exp(loglik))) -> pars
  
  print(with(pars,1/sum(weights^2))) ## Effective sample size
  
  sample_ind <- sample.int(nrow(pars),size=nsamp,replace=TRUE,prob=pars$weights)
  params <- parmat(coef(M0),nrep=nsamp)
  nm <- intersect(rownames(params),names(pars))
  params[nm,] <- t(pars[sample_ind,nm])
  
  pf <- pfilter(M0,params=params,save.params=TRUE,save.states=TRUE,seed=1568335316L)
  
  t <- time(M0)
  acast(melt(pf$saved.params),variable~rep~L1) -> p
  acast(melt(pf$saved.states),variable~rep~L1) -> x
  sim.cases <- laply(seq_along(t),function(k)rmeasure(M0,times=t[k],params=p[,,k],x=x[,,k,drop=FALSE]))
  dimnames(sim.cases) <- list(time=t,observable=c("cases","deaths"),rep=NULL)
  sim.cases <- melt(sim.cases[,"cases",],value.name='cases')
  
  filt_dist <- melt(list(states=pf$saved.states,params=pf$saved.params))
  filt_dist$time <- t[filt_dist$L2]
  filt_dist$L1 <- NULL
  filt_dist$L2 <- NULL
  filt_dist <- dcast(filt_dist,time+rep~variable)
  mutate(filt_dist,incidence=N_EI,mean.cases=rho*incidence) -> filt_dist
  
  filt_dist <- join(filt_dist,sim.cases,type='full',by=c("time","rep"))
  
  M1 <- M0
  time(M1) <- seq(from=18,to=30)
  timezero(M1) <- datmax
  p <- p[,,datmax,drop=TRUE]
  p[c("S_0","E_0","I_0","R_0"),] <- apply(x[,,datmax],2,
                                          function(x)c(x["S"],
                                                       sum(x[c("E1","E2","E3")]),
                                                       x["I"],x["R"]))
  
  sims <- simulate(M1,params=p,states=TRUE,obs=TRUE,seed=499643724L)
  sims <- melt(sims)
  sims$L1 <- NULL
  
  dimnames(p) <- setNames(dimnames(p),c("variable","rep"))
  pred_dist <- join(dcast(sims,time+rep~variable),
                    dcast(melt(p),rep~variable),
                    by='rep',type='left')
  pred_dist <- mutate(pred_dist,time=timezero(M1)+time,
                      incidence=N_EI,mean.cases=rho*incidence)
  
  fc_mf <- ldply(list(calibration=filt_dist,projection=pred_dist),.id='period')
  fc_mf$country <- "SierraLeone"
  fc_mf$type <- "raw"
  fc_mf <- subset(fc_mf,select=c(country,type,time,rep,cases,incidence,mean.cases,period))
  
  fc <- ldply(list(stoch=fc_mf,det=fc_tm),.id='model')
  fc <- ddply(fc,~country,mutate,
              model=factor(model,levels=c("stoch","det")),
              date=as.Date(starts[unique(as.character(country))])+7*(time-1))
  }) -> fc
fc <- ddply(fc,~country+type+model+time+date+period,summarize,
            prob=c(0.025,0.5,0.975),
            quantile=quantile(cases,probs=prob))
fc <- mutate(fc,prob=mapvalues(prob,from=c(0.025,0.5,0.975),
                               to=c("lower","median","upper")))
fc <- dcast(fc,time+date+period+country+type+model~prob,value.var='quantile')

source(paste0(baseurl,"data/eboladata.R"))
dat <- subset(dat,country=="SierraLeone"&is.finite(cases))
dat$period <- "calibration"

ggplot(data=subset(fc,(type=="raw"&model=="stoch")|(type=="cum"&model=="det")),
       mapping=aes(x=date))+
  geom_ribbon(mapping=aes(ymin=lower,ymax=upper,color=model,fill=model,
                          group=interaction(model,period),alpha=period))+
  geom_line(size=1.3,mapping=aes(y=median,color=model,group=interaction(model,period)))+
  geom_point(data=dat,mapping=aes(x=date,y=cases,shape=period),size=3)+
  geom_vline(xintercept=as.numeric(max(subset(fc,period=='calibration')$date)))+
  geom_text(data=data.frame(x=as.Date(c("2014-06-08","2014-10-12")),
                            y=2100,
                            label=c("Calibration","Projection")),
            mapping=aes(x=x,y=y,label=label), hjust=0)+
  scale_alpha_manual(values=c(calibration=0.4,projection=0.2))+
  scale_fill_manual(values=c(det=alpha("red",0.4),stoch=alpha("blue",0.3)),
                    labels=c(stoch="stochastic, raw",
                             det="deterministic, cumulative"))+
  scale_color_manual(values=c(det=alpha("red",0.4),stoch=alpha("blue",0.3)),
                     labels=c(stoch="stochastic, raw",
                              det="deterministic, cumulative"))+
  scale_shape_manual(values=c(calibration=17,projection=19),guide=FALSE)+
  labs(x="Date",y="Weekly reported incidence",fill="model, data",
       color="model, data",alpha="")+
  guides(alpha=FALSE)+
  theme(strip.background=element_rect(color=NA,fill=NA),
        text=element_text(size=15),
        strip.text=element_text(face='bold'),
        panel.grid.minor=element_line(color=gray(0.9)),
        legend.position=c(0.3,0.5)
        )

Discussion

profiles <- ldply(list(det=readRDS("tm-fit.rds"),
                       stoch=readRDS("profile2.rds")),.id='mod')
profiles <- subset(profiles,country=="SierraLeone"&(mod=="det"|type=="raw"))
profiles <- ddply(profiles,~type+mod,mutate,dll=loglik-max(loglik))

ggplot(data=ddply(profiles,~R0+type+mod,subset,loglik==max(loglik)&dll>-10),
       mapping=aes(x=R0,y=dll,group=interaction(type,mod),
                   color=interaction(type,mod),fill=interaction(type,mod)))+
  geom_point(alpha=0.5)+geom_smooth(method='gam',formula=y~s(x),alpha=0.2)+
  scale_color_manual(name="",values=c(raw.stoch='blue',
                                      cum.det='red',cum.stoch='green',
                                      raw.det='goldenrod'),
                     labels=c(raw.stoch="stochastic, raw",
                              cum.det="deterministic, cumulative",
                              raw.det="deterministic, raw",
                              cum.stoch="stochastic, cumulative"))+
  scale_fill_manual(name="",values=c(raw.stoch='blue',cum.det='red',
                                     cum.stoch='green',
                                     raw.det='goldenrod'),
                    labels=c(raw.stoch="stochastic, raw",
                             cum.det="deterministic, cumulative",
                             raw.det="deterministic, raw",
                             cum.stoch="stochastic, cumulative"))+
  geom_hline(yintercept=-0.5*qchisq(p=0.95,df=1))+
  labs(x=expression("basic reproduction number,"~R[0]),
       y="log scaled likelihood")+
  expand_limits(x=c(1,2))+ylim(c(-10,1))+
  theme(strip.background=element_rect(color=NA,fill=NA),
        text=element_text(size=15),
        strip.text=element_text(face='bold'),
        panel.grid.minor=element_line(color=gray(0.9)),
        legend.position=c(0.8,0.8))

fc <- readRDS("forecasts.rds")
fc <- ddply(fc,~country+type+model+time+date+period,summarize,
            prob=c(0.025,0.5,0.975),
            quantile=quantile(mean.cases,probs=prob))
fc <- mutate(fc,prob=mapvalues(prob,from=c(0.025,0.5,0.975),
                               to=c("lower","median","upper")))
fc <- dcast(fc,time+date+period+country+type+model~prob,value.var='quantile')

ggplot(data=fc,mapping=aes(x=date))+
  geom_ribbon(mapping=aes(ymin=lower,ymax=upper,color=interaction(type,model),
                          fill=interaction(type,model),
                          group=interaction(type,model,period),alpha=period))+
  geom_line(size=1.3,mapping=aes(y=median,color=interaction(type,model),
                                 group=interaction(type,model,period)))+
  geom_vline(xintercept=as.numeric(max(subset(fc,period=="calibration")$date)))+
  geom_text(data=data.frame(x=as.Date(c("2014-06-08","2014-10-12")),
                            y=2100,
                            label=c("Calibration","Projection")),
            mapping=aes(x=x,y=y,label=label), hjust=0)+
  scale_alpha_manual(values=c(calibration=0.4,projection=0.2))+
  scale_fill_manual(values=c(cum.det=alpha("red",0.4),
                             raw.stoch=alpha("blue",0.3),
                             raw.det=alpha("goldenrod",0.6)),
                    labels=c(cum.det="deterministic, cumulative",
                             raw.stoch="stochastic, raw",
                             raw.det="deterministic, raw"))+
  scale_color_manual(values=c(cum.det=alpha("red",0.4),
                              raw.stoch=alpha("blue",0.3),
                              raw.det=alpha("goldenrod",0.6)),
                     labels=c(cum.det="deterministic, cumulative",
                              raw.stoch="stochastic, raw",
                              raw.det="deterministic, raw"))+
  labs(x="Date",y="Weekly reported incidence",fill="model, data",
       color="model, data",alpha="")+
  guides(alpha=FALSE)+
  theme_bw()+
  theme(strip.background=element_rect(color=NA,fill=NA),
        text=element_text(size=15),
        strip.text=element_text(face='bold'),
        panel.grid.minor=element_line(color=gray(0.9)),
        legend.position=c(0.3,0.5)
        )

fc <- readRDS("forecasts.rds")
fc <- ddply(fc,~country+type+model+time+date+period,summarize,
            prob=c(0.025,0.5,0.975),
            quantile=quantile(incidence,probs=prob))
fc <- mutate(fc,prob=mapvalues(prob,from=c(0.025,0.5,0.975),
                               to=c("lower","median","upper")))
fc <- dcast(fc,time+date+period+country+type+model~prob,value.var='quantile')

ggplot(data=fc,mapping=aes(x=date))+
  geom_ribbon(mapping=aes(ymin=lower,ymax=upper,color=interaction(type,model),
                          fill=interaction(type,model),
                          group=interaction(type,model,period),alpha=period))+
  geom_line(size=1.3,mapping=aes(y=median,color=interaction(type,model),
                                 group=interaction(type,model,period)))+
  geom_vline(xintercept=as.numeric(max(subset(fc,period=="calibration")$date)))+
  geom_text(data=data.frame(x=as.Date(c("2014-06-08","2014-10-12")),
                            y=12000,
                            label=c("Calibration","Projection")),
            mapping=aes(x=x,y=y,label=label), hjust=0)+
  scale_alpha_manual(values=c(calibration=0.4,projection=0.2))+
  scale_fill_manual(values=c(cum.det=alpha("red",0.4),raw.stoch=alpha("blue",0.3),
                             raw.det=alpha("goldenrod",0.6)),
                    labels=c(cum.det="deterministic, cumulative",
                             raw.stoch="stochastic, raw",
                             raw.det="deterministic, raw"))+
  scale_color_manual(values=c(cum.det=alpha("red",0.4),
                              raw.stoch=alpha("blue",0.3),
                              raw.det=alpha("goldenrod",0.6)),
                     labels=c(cum.det="deterministic, cumulative",
                              raw.stoch="stochastic, raw",
                              raw.det="deterministic, raw"))+
  labs(x="Date",y="True weekly incidence",fill="model, data",
       color="model, data",alpha="")+
  guides(alpha=FALSE)+
  theme_bw()+
  theme(strip.background=element_rect(color=NA,fill=NA),
        text=element_text(size=15),
        strip.text=element_text(face='bold'),
        panel.grid.minor=element_line(color=gray(0.9)),
        legend.position=c(0.3,0.5)
        )