README.Rmd

---
title: "Real-time estimation of the novel coronavirus incubation time"
output: md_document
editor_options: 
  chunk_output_type: console
---
[![DOI](https://zenodo.org/badge/236349745.svg)](https://zenodo.org/badge/latestdoi/236349745)

```{r opts, include=FALSE}
# knitr setup: knit from the project root, and by default evaluate every
# chunk while hiding its code, messages, and warnings in the rendered README.
knitr::opts_knit$set(root.dir = here::here())
knitr::opts_chunk$set(
  echo = FALSE,
  message = FALSE,
  warning = FALSE,
  eval = TRUE,
  fig.align = "center",
  fig.pos = "ht"
)
```

```{r load, include=FALSE}
library(tidyverse)
library(lubridate)
library(coarseDataTools)
library(gridExtra)
library(ggpubr)
# devtools::install_github("salauer/activemonitr")
library(activeMonitr)
# Eight-colour palette used by the plots further down
# (presumably colour-blind friendly, going by the name -- confirm).
cbbPalette <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Fix the RNG seed.
set.seed(1)

## Read in the coronavirus data. The four interval-bound columns get a
## `_date` suffix so that the bare names (EL, ER, SL, SR) are free to hold
## the numeric day offsets computed later.
ncov_raw <- rename(
  read_csv("data/nCoV-IDD-traveler-data.csv"),
  EL_date = EL,
  ER_date = ER,
  SL_date = SL,
  SR_date = SR
)

## Restrict exposure to on/after 1 December 2019 and fill in a missing right
## bound on symptom onset.
ncov_ELSR <- mutate(
    ncov_raw,
    # a missing EL defaults to 1 Dec 2019 ...
    EL_date = ymd_hms(ifelse(is.na(EL_date), "2019-12-01 00:00:00", EL_date)),
    # ... and an EL before 1 Dec 2019 is moved up to 1 Dec 2019
    EL_date = if_else(EL_date < ymd_hms("2019-12-01 00:00:00"),
                      ymd_hms("2019-12-01 00:00:00"),
                      EL_date),
    # if SR is missing (i.e. does not parse as a date-time), use PR
    SR_date = ymd_hms(ifelse(is.na(ymd_hms(SR_date)), PR, SR_date)),
    # SR_fever is only for cases with confirmed fever dates
    SR_fever = ymd_hms(SR_fever)
)

## Build the analysis data set from ncov_ELSR: fill in the remaining missing
## bounds, clamp them so that ER <= SR and SL >= EL, express every time as
## days since 1 Dec 2019, and drop incomplete, unreviewed, or inconsistent
## entries.
ncov <- ncov_ELSR %>%
    # if ER is missing, use SR; if SL is missing, use EL;
    # then clamp ER to be no later than SR and SL to be no earlier than EL
    mutate(ER_date = if_else(is.na(ER_date), SR_date, ymd_hms(ER_date)),
           ER_date = if_else(ER_date > SR_date, SR_date, ER_date),
           SL_date = if_else(is.na(SL_date), EL_date, ymd_hms(SL_date)),
           SL_date = if_else(SL_date < EL_date, EL_date, SL_date),
           # a case with a fever SR but no fever SL borrows the overall SL
           SL_fever = if_else(is.na(SL_fever) & !is.na(SR_fever),
                              SL_date, ymd_hms(SL_fever)),
           SL_fever = if_else(SL_fever < EL_date, EL_date, SL_fever)) %>%
    # calculate days since 1 Dec 2019
    mutate(EL = difftime(EL_date, ymd_hms("2019-12-01 00:00:00"),
                         units = "days") %>%
               as.numeric(),
           ER = difftime(ER_date, ymd_hms("2019-12-01 00:00:00"),
                         units = "days") %>%
               as.numeric(),
           SL = difftime(SL_date, ymd_hms("2019-12-01 00:00:00"),
                         units = "days") %>%
               as.numeric(),
           SR = difftime(SR_date, ymd_hms("2019-12-01 00:00:00"),
                         units = "days") %>%
               as.numeric(),
           SL_fever = difftime(SL_fever, ymd_hms("2019-12-01 00:00:00"),
                               units = "days") %>%
               as.numeric(),
           SR_fever = difftime(SR_fever, ymd_hms("2019-12-01 00:00:00"),
                               units = "days") %>%
               as.numeric(),
           # PL and PR have not been parsed with ymd_hms() at this point, so
           # difftime() converts them itself. If they are still character
           # strings, that conversion would otherwise use the session's local
           # time zone while the origin is in UTC; tz = "UTC" keeps PL and PR
           # on the same clock as the other bounds.
           PL = difftime(PL, ymd_hms("2019-12-01 00:00:00"),
                         tz = "UTC", units = "days") %>%
               as.numeric(),
           PR = difftime(PR, ymd_hms("2019-12-01 00:00:00"),
                         tz = "UTC", units = "days") %>%
               as.numeric(),
           # widths of the exposure, symptom-onset, and fever-onset windows
           E_int = ER - EL,
           S_int = SR - SL,
           S_fever_int = SR_fever - SL_fever,
           # longest and shortest incubation time compatible with the windows
           max_inc_int = SR - EL,
           min_inc_int = SL - ER) %>%
    # remove any entries missing EL, ER, SL, or SR
    filter(!is.na(EL_date), !is.na(ER_date), !is.na(SL_date), !is.na(SR_date)) %>%
    # remove entries that haven't been reviewed by two people
    filter(!is.na(REVIEWER2), REVIEWER2 != "NA") %>%
    # remove entries whose exposure or onset interval is not strictly positive
    # remove entries where ER greater than SR or EL greater than SL
    filter(E_int > 0, S_int > 0, ER <= SR, SL >= EL)

## Split the data set by observation type

## cases with at least one fever-onset bound recorded ...
ncov_fever <- filter(ncov, !(is.na(SL_fever) & is.na(SR_fever)))
## ... and the remaining cases, which have neither fever bound
ncov_mild <- filter(ncov, is.na(SL_fever), is.na(SR_fever))

## destinations outside of mainland China
## (Hong Kong and Macau are treated as outside)
ncov_foreign <- filter(ncov,
                       COUNTRY.DEST != "China" |
                           PROVINCE.DEST %in% c("HongKong", "Macau"))

## fever cases among the non-mainland destinations
ncov_foreign_fever <- filter(ncov_foreign,
                             !(is.na(SL_fever) & is.na(SR_fever)))

## destinations within mainland China
ncov_mainland <- filter(ncov,
                        COUNTRY.DEST == "China",
                        !(PROVINCE.DEST %in% c("HongKong", "Macau")))

## only cases with a defined EL: same preparation as the main data set, except
## that entries without a reported EL are dropped instead of being given a
## default, and no bounds are clamped
ncov_EL <- ncov_raw %>%
    # drop entries with a missing EL, then parse the remaining ones as-is
    filter(!is.na(EL_date)) %>%
    mutate(EL_date = ymd_hms(EL_date),
           # if SR is missing, use PR
           SR_date = ifelse(ymd_hms(SR_date) %>% is.na, PR, SR_date) %>%
               ymd_hms(),
           # SR_fever is only for cases with confirmed fever dates
           SR_fever = ymd_hms(SR_fever)) %>%
    # if ER is missing, use SR; if SL is missing, use EL
    mutate(ER_date = if_else(is.na(ER_date), SR_date, ymd_hms(ER_date)),
           SL_date = if_else(is.na(SL_date), EL_date, ymd_hms(SL_date)),
           SL_fever = if_else(is.na(SL_fever) & !is.na(SR_fever),
                              SL_date, ymd_hms(SL_fever))) %>%
    # calculate days since 1 Dec 2019
    mutate(EL = difftime(EL_date, ymd_hms("2019-12-01 00:00:00"),
                         units = "days") %>%
               as.numeric(),
           ER = difftime(ER_date, ymd_hms("2019-12-01 00:00:00"),
                         units = "days") %>%
               as.numeric(),
           SL = difftime(SL_date, ymd_hms("2019-12-01 00:00:00"),
                         units = "days") %>%
               as.numeric(),
           SR = difftime(SR_date, ymd_hms("2019-12-01 00:00:00"),
                         units = "days") %>%
               as.numeric(),
           SL_fever = difftime(SL_fever, ymd_hms("2019-12-01 00:00:00"),
                               units = "days") %>%
               as.numeric(),
           SR_fever = difftime(SR_fever, ymd_hms("2019-12-01 00:00:00"),
                               units = "days") %>%
               as.numeric(),
           # PL and PR have not been parsed with ymd_hms() at this point, so
           # difftime() converts them itself. If they are still character
           # strings, that conversion would otherwise use the session's local
           # time zone while the origin is in UTC; tz = "UTC" keeps PL and PR
           # on the same clock as the other bounds.
           PL = difftime(PL, ymd_hms("2019-12-01 00:00:00"),
                         tz = "UTC", units = "days") %>%
               as.numeric(),
           PR = difftime(PR, ymd_hms("2019-12-01 00:00:00"),
                         tz = "UTC", units = "days") %>%
               as.numeric(),
           # widths of the exposure, symptom-onset, and fever-onset windows
           E_int = ER - EL,
           S_int = SR - SL,
           S_fever_int = SR_fever - SL_fever,
           # longest and shortest incubation time compatible with the windows
           max_inc_int = SR - EL,
           min_inc_int = SL - ER) %>%
    # remove any entries missing EL, ER, SL, or SR
    filter(!is.na(EL_date), !is.na(ER_date), !is.na(SL_date), !is.na(SR_date)) %>%
    # remove entries that haven't been reviewed by two people
    filter(!is.na(REVIEWER2)) %>%
    # remove entries whose exposure or onset interval is not strictly positive
    # remove entries where ER greater than SR or EL greater than SL
    # remove entries whose EL is not after 1 Dec 2019
    filter(E_int > 0, S_int > 0, ER <= SR, SL >= EL, EL > 0)

# Distribution parameters from other studies, used in the comparison section
# (which reads the columns `type`, `par1`, `par2`, and `study`).
backer_params <- read_csv(file = "data/backer-params.csv")
```

```{r fill-in-values}
## Number of regions: distinct destination countries other than China, plus
## distinct destination provinces within China.
## Entries with a missing country or province are left out of the count; a
## plain logical subset would carry them through as NA and unique() would
## then count NA as one more region.
num_reg <- length(unique(na.omit(
               ncov$COUNTRY.DEST[ncov$COUNTRY.DEST != "China"]))) +
    length(unique(na.omit(
        ncov$PROVINCE.DEST[ncov$COUNTRY.DEST == "China"])))
```

```{r fit-dic, results='hide', cache=T}
set.seed(1)
# Assemble the data frame passed to dic.fit(): the four interval bounds plus
# a `type` column that counts how many of the two intervals (symptom onset,
# exposure) have zero width.
ncov_inc_dat <- as.data.frame(
    select(
        mutate(ncov, type = as.numeric(S_int == 0) + as.numeric(E_int == 0)),
        EL, ER, SL, SR, type
    )
)

# Log-normal fit (dist = "L") without bootstrapping ...
ncov_inc_fit_asym <- dic.fit(
    ncov_inc_dat,
    dist = "L",
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)

# ... and with 1000 bootstrap resamples. Bootstrap intervals are used for
# now; would MCMC be better?
ncov_inc_fit_boot <- dic.fit(
    ncov_inc_dat,
    dist = "L",
    n.boots = 1000,
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)
```

```{r fit-dic-gamma, results='hide', cache=T}
set.seed(1)

# Gamma fit (dist = "G") to the same data, with 1000 bootstrap resamples.
# Bootstrap intervals are used for now; would MCMC be better?
ncov_gam_fit_boot <- dic.fit(
    ncov_inc_dat,
    dist = "G",
    n.boots = 1000,
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)
```

```{r fit-dic-weibull, results='hide', cache=T}
set.seed(1)

# Weibull fit (dist = "W") to the same data, without bootstrapping ...
ncov_wei_fit_asym <- dic.fit(
    ncov_inc_dat,
    dist = "W",
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)

# ... and with 1000 bootstrap resamples. Bootstrap intervals are used for
# now; would MCMC be better?
ncov_wei_fit_boot <- dic.fit(
    ncov_inc_dat,
    dist = "W",
    n.boots = 1000,
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)
```

```{r fit-dic-erlang, results='hide', cache=T}
set.seed(1)

# Erlang fit (dist = "E") to the same data. Unlike the other
# parameterizations, this one is fit with dic.fit.mcmc() rather than the
# bootstrap.
ncov_erl_fit <- dic.fit.mcmc(
    ncov_inc_dat,
    dist = "E",
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)
```

```{r fit-fever-dics, results='hide', cache=T}
set.seed(1)
# Data for dic.fit(): the exposure bounds are unchanged, the symptom-onset
# window is replaced by the fever-onset window, and every case gets type 0.
ncov_fever_dic <- as.data.frame(
    mutate(
        select(ncov_fever, EL, ER, SL = SL_fever, SR = SR_fever),
        type = 0
    )
)

# Log-normal fit to the fever-only data, without bootstrapping ...
ncov_fever_fit_asym <- dic.fit(
    ncov_fever_dic,
    dist = "L",
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)

# ... and with 1000 bootstrap resamples. Bootstrap intervals are used for
# now; would MCMC be better?
ncov_fever_fit_boot <- dic.fit(
    ncov_fever_dic,
    dist = "L",
    n.boots = 1000,
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)
```

```{r fever-dic-plots, eval=F}
## Plot the bootstrap fit, then tabulate its percentile estimates (first two
## rows and fourth column of @ests left out) next to their difference from
## the full-data fit
plot(ncov_fever_fit_boot, main = "fever-only results")

fever_tbl <- as.data.frame(ncov_fever_fit_boot@ests[-(1:2), -4])
fever_tbl$diff <- ncov_fever_fit_boot@ests[-(1:2), "est"] -
    ncov_inc_fit_boot@ests[-(1:2), "est"]
knitr::kable(fever_tbl)


## plot the asymptotic fit and table of intervals
# plot(ncov_fever_fit_asym, main="asymptotic results (fever only)")
# ncov_fever_fit_asym
```

```{r fit-foreign-dics, results='hide', cache=T}
set.seed(1)
# Data for dic.fit(): interval bounds of the non-mainland cases, all type 0.
ncov_foreign_dic <- as.data.frame(
    mutate(select(ncov_foreign, EL, ER, SL, SR), type = 0)
)

# Log-normal fit to the non-mainland data, without bootstrapping ...
ncov_foreign_fit_asym <- dic.fit(
    ncov_foreign_dic,
    dist = "L",
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)

# ... and with 1000 bootstrap resamples.
ncov_foreign_fit_boot <- dic.fit(
    ncov_foreign_dic,
    dist = "L",
    n.boots = 1000,
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)

```

```{r foreign-dic-plots, eval=F}
## Plot the bootstrap fit, then tabulate its percentile estimates (first two
## rows and fourth column of @ests left out) next to their difference from
## the full-data fit
plot(ncov_foreign_fit_boot, main = "non-Mainland results")

foreign_tbl <- as.data.frame(ncov_foreign_fit_boot@ests[-(1:2), -4])
foreign_tbl$diff <- ncov_foreign_fit_boot@ests[-(1:2), "est"] -
    ncov_inc_fit_boot@ests[-(1:2), "est"]
knitr::kable(foreign_tbl)

## plot the asymptotic fit and table of intervals
# plot(ncov_foreign_fit_asym, main="asymptotic results (foreign only)")
# ncov_foreign_fit_asym
```

```{r fit-year-dics, results='hide', cache=T}
set.seed(1)
## Sensitivity analysis: the data are prepared again from the CSV, but
## exposure is restricted to after 1 December 2018 instead of 2019.
## NOTE(review): unlike the main `ncov` pipeline, ER and SL are not clamped to
## SR and EL here, so such entries are removed by the filter below instead --
## confirm that this difference is intended.
ncov_year_dic <- read_csv("data/nCoV-IDD-traveler-data.csv") %>%
    rename(EL_date = EL, ER_date = ER, SL_date = SL, SR_date = SR) %>%
    mutate(
        # if EL is missing, use 1 Dec 2018 ...
        EL_date = ymd_hms(ifelse(is.na(EL_date),
                                 "2018-12-01 00:00:00",
                                 EL_date)),
        # ... and if it is before 1 Dec 2018, move it up to 1 Dec 2018
        EL_date = if_else(EL_date < ymd_hms("2018-12-01 00:00:00"),
                          ymd_hms("2018-12-01 00:00:00"),
                          EL_date),
        # if SR is missing, use PR
        SR_date = ymd_hms(ifelse(is.na(ymd_hms(SR_date)), PR, SR_date)),
        # SR_fever is only for cases with confirmed fever dates
        SR_fever = ymd_hms(SR_fever),
        # if ER is missing, use SR; if SL is missing, use EL
        ER_date = if_else(is.na(ER_date), SR_date, ymd_hms(ER_date)),
        SL_date = if_else(is.na(SL_date), EL_date, ymd_hms(SL_date)),
        # a case with a fever SR but no fever SL borrows the overall SL
        SL_fever = if_else(is.na(SL_fever) & !is.na(SR_fever),
                           SL_date,
                           ymd_hms(SL_fever))
    ) %>%
    # calculate days since 1 Dec 2018
    mutate(
        EL = as.numeric(difftime(EL_date, ymd_hms("2018-12-01 00:00:00"),
                                 units = "days")),
        ER = as.numeric(difftime(ER_date, ymd_hms("2018-12-01 00:00:00"),
                                 units = "days")),
        SL = as.numeric(difftime(SL_date, ymd_hms("2018-12-01 00:00:00"),
                                 units = "days")),
        SR = as.numeric(difftime(SR_date, ymd_hms("2018-12-01 00:00:00"),
                                 units = "days")),
        SL_fever = as.numeric(difftime(SL_fever,
                                       ymd_hms("2018-12-01 00:00:00"),
                                       units = "days")),
        SR_fever = as.numeric(difftime(SR_fever,
                                       ymd_hms("2018-12-01 00:00:00"),
                                       units = "days")),
        E_int = ER - EL,
        S_int = SR - SL,
        S_fever_int = SR_fever - SL_fever
    ) %>%
    filter(
        # remove any entries missing EL, ER, SL, or SR
        !is.na(EL_date), !is.na(ER_date), !is.na(SL_date), !is.na(SR_date),
        # remove entries that haven't been reviewed by two people
        !is.na(REVIEWER2),
        # remove entries whose exposure or onset interval is not strictly
        # positive
        E_int > 0, S_int > 0,
        # remove entries where ER greater than SR or EL greater than SL
        ER <= SR, SL >= EL
    ) %>%
    # keep only what dic.fit() needs; every case gets type 0
    select(EL, ER, SL, SR) %>%
    mutate(type = 0) %>%
    as.data.frame()

# Log-normal fit to the EL-2018 data, without bootstrapping ...
ncov_year_fit_asym <- dic.fit(
    ncov_year_dic,
    dist = "L",
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)

# ... and with 1000 bootstrap resamples.
ncov_year_fit_boot <- dic.fit(
    ncov_year_dic,
    dist = "L",
    n.boots = 1000,
    ptiles = c(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975)
)

```

```{r year-dic-plots, eval=F}
## Plot the bootstrap fit, then tabulate its percentile estimates (first two
## rows and fourth column of @ests left out) next to their difference from
## the full-data fit
plot(ncov_year_fit_boot, main = "EL-2018 results")

year_tbl <- as.data.frame(ncov_year_fit_boot@ests[-(1:2), -4])
year_tbl$diff <- ncov_year_fit_boot@ests[-(1:2), "est"] -
    ncov_inc_fit_boot@ests[-(1:2), "est"]
knitr::kable(year_tbl)
```

# Real-time estimation of the novel coronavirus incubation time

Updated: `r date()`

[Read the medRxiv preprint](https://www.medrxiv.org/content/10.1101/2020.02.02.20020016v1)

Our lab has been collecting data (freely available at [`data/nCoV-IDD-traveler-data.csv`](https://github.com/HopkinsIDD/ncov_incubation/blob/master/data/nCoV-IDD-traveler-data.csv)) on the exposure and symptom onset for novel coronavirus (COVID-19) cases that have been confirmed outside of the Hubei province.
These cases have been confirmed either in other countries or in regions of China with no known local transmission.
We search for news articles and reports in both English and Chinese and abstract the data necessary to estimate the incubation period of COVID-19.
Two team members independently review the full text of each case report to ensure that data is correctly input.
Discrepancies are resolved by discussion and consensus.

Quick links:

- [Data summary](#data-summary)
- [Exposure and symptom onset windows](#exposure-and-symptom-onset-windows)
- [Incubation period estimates](#incubation-period-estimates)
- [Alternate estimates and sensitivity analyses](#alternate-estimates-and-sensitivity-analyses)
- [Comparison to other estimates](#comparison-to-other-estimates)
- [Parameter estimates](#parameter-estimates)
- [Active monitoring analysis](#active-monitoring-analysis)
- [Time to hospitalization](#time-to-hospitalization)

## Data summary

There are `r nrow(ncov)` cases from `r num_reg` countries and provinces outside of Hubei, China.
Of those `r sum(ncov$SEX=="Female", na.rm=T)` are known to be female (`r (100*sum(ncov$SEX=="Female", na.rm=T)/nrow(ncov)) %>% round()`%) and `r sum(ncov$SEX=="Male", na.rm=T)` are male (`r (100*sum(ncov$SEX=="Male", na.rm=T)/nrow(ncov)) %>% round()`%).
The median age is about `r median((ncov$AGEL+ncov$AGER)/2, na.rm=T)` years (IQR: `r quantile((ncov$AGEL+ncov$AGER)/2, probs=.25, na.rm=T)`-`r quantile((ncov$AGEL+ncov$AGER)/2, probs=.75, na.rm=T)`).
`r nrow(ncov) - nrow(ncov_foreign)` cases are from Mainland China (`r (100*(nrow(ncov) - nrow(ncov_foreign))/nrow(ncov)) %>% round`%), while `r nrow(ncov_foreign)` are from the rest of the world (`r (100*nrow(ncov_foreign)/nrow(ncov)) %>% round()`%).
`r nrow(ncov_fever)` cases presented with a fever (`r (100*nrow(ncov_fever)/nrow(ncov)) %>% round()`%).

```{r data-summary, echo=FALSE, message=FALSE, warning=FALSE, fig.cap="This figure displays the exposure and symptom onset windows for each case in our dataset, relative to the right-bound of the exposure window (ER). The blue bars indicate the exposure windows and the red bars indicate the symptom onset windows for each case. Purple areas are where those two bars overlap."}
## Shift every window so that day 0 is the right bound of the exposure window
## (ER), compute the midpoint of each window, and order the cases by SR - EL
dat_sum <- ncov %>%
    mutate(ELnew = EL-ER,
           ERnew = ER-ER,
           Emid = (ELnew + ERnew)/2,
           SLnew = SL-ER,
           SRnew = SR-ER,
           Smid = (SLnew + SRnew)/2,
           PLnew = PL-ER,
           PRnew = PR-ER,
           Pmid = (PLnew + PRnew)/2,
           UID=reorder(UID, SR-EL))

## One row per case: a translucent bar for each window (blue = EL-ER,
## red = SL-SR, green = PL-PR), a dot at each window's midpoint, and a thin
## grey line joining the exposure and symptom-onset midpoints.
## The points inherit y from the plot-level aes().
ggplot(dat_sum, aes(y=factor(UID))) + 
    geom_segment(aes(x=ELnew, xend=ERnew, yend=factor(UID)), 
                 color="#0072B2", size=2, alpha=.25) +
    geom_segment(aes(x=SLnew, xend=SRnew, yend=factor(UID)), 
                 size=2, color="#CC0000", alpha=.25) +
    geom_segment(aes(x=PLnew, xend=PRnew, yend=factor(UID)), 
                 size=2, color="#00a841", alpha=.25) +
    geom_point(aes(x=Emid), size=0.5, color="#0072B2") +
    geom_point(aes(x=Smid), size=0.5, color="#CC0000") +
    geom_point(aes(x=Pmid), size=0.5, color="#00a841") +
    geom_segment(aes(x=Emid, xend=Smid, yend=factor(UID)), size=0.33, color="#999999") +
    #ggtitle("Exposure and symptom onset windows") +
    scale_x_continuous("Days from last possible exposure") +
    scale_y_discrete("Case") +
    theme_bw() +
    # one row per case, so individual case labels and ticks are hidden
    theme(axis.text.y = element_blank(),
          axis.ticks.y= element_blank(),
          axis.text.x=element_text(color="black"))

```

The bars where the exposure and symptom onset windows completely overlap are frequently travelers from Wuhan who were symptomatic on arrival in another country, where no further details were released.
These cases could have been exposed or symptomatic at any point prior to their trip.

## Exposure and symptom onset windows

The necessary components for estimating the incubation period are left and right bounds for the exposure (EL and ER) and symptom onset times (SL and SR) for each case.
We use explicit dates and times when they are reported in the source documents, however when they are not available, we make the following assumptions:

- For cases without a reported right-bound on symptom onset time (SR), we use the time that the case is first presented to a hospital or, lacking that, the time that the source document was published
- For cases without an EL, we use 2019 December 1, which was the onset date for the first reported COVID-19 case; though we will test this assumption later
- For cases without an ER, we use the SR
- For cases without an SL, we use the EL

Under these assumptions, the median exposure interval was `r round(median(ncov$E_int),1)` days (range: `r round(min(ncov$E_int),1)`-`r round(max(ncov$E_int),1)`) and the median symptom onset interval was `r round(median(ncov$S_int),1)` days (range: `r round(min(ncov$S_int),1)`-`r round(max(ncov$S_int),1)`).

## Incubation period estimates

We estimate the incubation period using the coarseDataTools package based on the paper by [Reich *et al*, 2009](https://onlinelibrary.wiley.com/doi/pdf/10.1002/sim.3659).
We assume a log-normal incubation period and use a bootstrap method for calculating confidence intervals.

The first model is fit to all of the data; we report the median, 2.5th, and 97.5th quantiles (and their confidence intervals):

```{r dic-plots}
## Plot the bootstrap fit, overlay the confidence intervals of the 2.5th,
## 50th, and 97.5th percentiles as horizontal bars at the matching heights,
## and print the table of estimates (fourth column of @ests left out)
ci.col <- rgb(230, 85, 13, 255, maxColorValue = 255)

plot(ncov_inc_fit_boot,
     ylab = "Proportion symptomatic cases with symptoms",
     xlab = "Days after infection",
     main = "",
     xlim = c(0, 20))
# row name in @ests -> height (cumulative proportion) at which its bar is drawn
ci_heights <- c(p2.5 = 0.025, p50 = 0.5, p97.5 = 0.975)
for (ptile in names(ci_heights)) {
    lines(x = ncov_inc_fit_boot@ests[ptile, c("CIlow", "CIhigh")],
          y = rep(ci_heights[[ptile]], 2),
          col = ci.col, lwd = 2.5)
}

knitr::kable(ncov_inc_fit_boot@ests[, -4])
# exp_val <- exp(ncov_inc_fit_boot@ests["meanlog", "est"]+0.5*(ncov_inc_fit_boot@ests["sdlog", "est"])^2)
# print(paste("The estimated mean is", exp_val))
```

The median incubation period lasts `r ncov_inc_fit_boot@ests["p50", "est"]` days (CI: `r ncov_inc_fit_boot@ests["p50", "CIlow"]`-`r ncov_inc_fit_boot@ests["p50", "CIhigh"]`).
An estimated 2.5% of incubation periods pass in less than `r ncov_inc_fit_boot@ests["p2.5", "est"]` days (CI: `r ncov_inc_fit_boot@ests["p2.5", "CIlow"]`-`r ncov_inc_fit_boot@ests["p2.5", "CIhigh"]`), while 97.5% of the population would experience symptoms by `r ncov_inc_fit_boot@ests["p97.5", "est"]` days (CI: `r ncov_inc_fit_boot@ests["p97.5", "CIlow"]`-`r ncov_inc_fit_boot@ests["p97.5", "CIhigh"]`) since their exposure.
The 'meanlog' and 'sdlog' estimates are the log-scale location (the log of the median) and dispersion parameters for a LogNormal distribution; i.e. we recommend using a LogNormal(`r ncov_inc_fit_boot@ests["meanlog", "est"]`, `r ncov_inc_fit_boot@ests["sdlog", "est"]`) distribution to appropriately represent the incubation time distribution.

## Alternate estimates and sensitivity analyses

### Alternate parameterizations

We fit other commonly-used parameterizations of the incubation period as comparisons to the log-normal distribution: gamma, Weibull, and Erlang. 

```{r other-params}
## Stack the estimates from the four parameterizations into one long data
## frame. For each fit, row 2 and column 4 of @ests are left out, the first
## remaining row is labelled "mean", and the other rows are labelled with
## their percentile.
all_est_plot <- ncov_inc_fit_boot@ests[-2, -4] %>% 
    as.data.frame() %>% 
    mutate(qtile=c("mean", 2.5, 5, 25, 50, 75, 95, 97.5),
           type="log-normal") %>% 
    bind_rows(ncov_gam_fit_boot@ests[-2, -4] %>% 
                  as.data.frame() %>% 
                  mutate(qtile=c("mean",2.5, 5, 25, 50, 75, 95, 97.5),
                         type="gamma")) %>% 
    bind_rows(ncov_wei_fit_boot@ests[-2, -4] %>% 
                  as.data.frame() %>% 
                  mutate(qtile=c("mean", 2.5, 5, 25, 50, 75, 95, 97.5),
                         type="Weibull")) %>% 
    bind_rows(ncov_erl_fit@ests[-2, -4] %>% 
                  as.data.frame() %>% 
                  mutate(qtile=c("mean", 2.5, 5, 25, 50, 75, 95, 97.5),
                         type="Erlang")) %>% 
    # exp() is applied to the "mean" rows only; those rows are removed by the
    # filter below, so it has no effect on what is plotted
    mutate(est=ifelse(qtile=="mean", exp(est), est),
           CIlow=ifelse(qtile=="mean", exp(CIlow), CIlow),
           CIhigh=ifelse(qtile=="mean", exp(CIhigh), CIhigh),
           study="JHU-IDD") %>% 
    # bind_rows(other_studies) %>% 
    # keep the 2.5th, 50th, and 97.5th percentiles
    filter(qtile %in% c(2.5, 50, 97.5))# %>% 
    # select(-CIhigh, -CIlow) %>% 
    # spread(qtile, est, sep="_")

## Point estimate and confidence interval for each kept percentile, dodged by
## parameterization; coord_flip() puts the incubation time on the horizontal
## axis.
## geom_errorbar() has no `height` parameter (the cap size is set by `width`),
## so the former `height=0.2` was ignored; it is left out here, which keeps
## the rendered figure the same.
ggplot(data=all_est_plot,
       aes(y=est, ymin=CIlow, ymax=CIhigh, x=as.factor(qtile), color=type)) +
    geom_errorbar(position=position_dodge(0.9)) +
    geom_point(position=position_dodge(0.9)) +
    scale_y_continuous("Incubation time, in days (with 95% CIs)", limits=c(0,16)) +
    scale_x_discrete("Estimate quantile") +
    scale_color_manual("Est\ntype",
                       values=cbbPalette[c(2,6,1,4)]) +
    theme_bw() + coord_flip() +
    theme(axis.text=element_text(color="black"))

## Log-likelihood of each fit, compared in the text that follows
dic_lls <- tibble(dist=c("log-normal",
                         "gamma",
                         "Weibull",
                         "Erlang"),
                  ll=c(ncov_inc_fit_boot@loglik,
                       ncov_gam_fit_boot@loglik,
                       ncov_wei_fit_boot@loglik,
                       ncov_erl_fit@loglik))

# ncov_gam_fit_boot
# ncov_wei_fit_boot
# ncov_erl_fit
```

The median estimates are very similar across parameterizations, while the Weibull distribution has a slightly smaller value at the 2.5th percentile and the log-normal distribution has a slightly larger value at the 97.5th percentile.
The log-likelihoods were very similar between distributions; the `r dic_lls$dist[which.max(dic_lls$ll)]` distribution having the largest log-likelihood (`r max(dic_lls$ll) %>% round(2)`) and the `r dic_lls$dist[which.min(dic_lls$ll)]` distribution having the smallest log-likelihood (`r min(dic_lls$ll) %>% round(2)`).

The gamma distribution has an estimated shape parameter of `r ncov_gam_fit_boot@ests["shape","est"] %>% round(2)` (95% CI: `r ncov_gam_fit_boot@ests["shape","CIlow"] %>% round(2)`-`r ncov_gam_fit_boot@ests["shape","CIhigh"] %>% round(2)`) and a scale parameter of `r ncov_gam_fit_boot@ests["scale","est"] %>% round(2)` (95% CI: `r ncov_gam_fit_boot@ests["scale","CIlow"] %>% round(2)`-`r ncov_gam_fit_boot@ests["scale","CIhigh"] %>% round(2)`).
The Weibull distribution has an estimated shape parameter of `r ncov_wei_fit_boot@ests["shape","est"] %>% round(2)` (95% CI: `r ncov_wei_fit_boot@ests["shape","CIlow"] %>% round(2)`-`r ncov_wei_fit_boot@ests["shape","CIhigh"] %>% round(2)`) and a scale parameter of `r ncov_wei_fit_boot@ests["scale","est"] %>% round(2)` (95% CI: `r ncov_wei_fit_boot@ests["scale","CIlow"] %>% round(2)`-`r ncov_wei_fit_boot@ests["scale","CIhigh"] %>% round(2)`).
The Erlang distribution has an estimated shape parameter of `r ncov_erl_fit@ests["shape","est"] %>% round(2)` (95% CI: `r ncov_erl_fit@ests["shape","CIlow"] %>% round(2)`-`r ncov_erl_fit@ests["shape","CIhigh"] %>% round(2)`) and a scale parameter of `r ncov_erl_fit@ests["scale","est"] %>% round(2)` (95% CI: `r ncov_erl_fit@ests["scale","CIlow"] %>% round(2)`-`r ncov_erl_fit@ests["scale","CIhigh"] %>% round(2)`).

### Sensitivity analyses

To make sure that our overall incubation estimates are sound, we ran a few analyses on subsets to see if the results held up.
Since the winter often brings cold air and other pathogens that can cause sore throats and coughs, we ran an analysis using only cases that reported a fever.
Since a plurality of our cases came from Mainland China, where assumptions about local transmission may be less firm, we ran an analysis without those cases.
Finally, we challenge our assumption that unknown ELs can be assumed to be 2019 December 1 ([Nextstrain estimates that it could have happened as early as September](https://nextstrain.org/ncov?dmax=2019-12-04&m=num_date)), by setting unknown ELs to 2018 December 1.

```{r all-sens-plot, warning=F, message=F}
## Stack the estimates from the full-data fit and the three sensitivity fits
## into one long data frame. For each fit, row 2 and column 4 of @ests are
## left out, the first remaining row is labelled "mean", and the other rows
## are labelled with their percentile.
all_sens_plot <- ncov_inc_fit_boot@ests[-2, -4] %>% 
    as.data.frame() %>% 
    mutate(qtile=c("mean", 2.5, 5, 25, 50, 75, 95, 97.5),
           type="all") %>% 
    bind_rows(ncov_fever_fit_boot@ests[-2, -4] %>% 
                  as.data.frame() %>% 
                  mutate(qtile=c("mean",2.5, 5, 25, 50, 75, 95, 97.5),
                         type="fever")) %>% 
    bind_rows(ncov_foreign_fit_boot@ests[-2, -4] %>% 
                  as.data.frame() %>% 
                  mutate(qtile=c("mean", 2.5, 5, 25, 50, 75, 95, 97.5),
                         type="non-Mainland")) %>% 
    bind_rows(ncov_year_fit_boot@ests[-2, -4] %>% 
                  as.data.frame() %>% 
                  mutate(qtile=c("mean", 2.5, 5, 25, 50, 75, 95, 97.5),
                         type="EL-2018")) %>% 
    # exp() is applied to the "mean" rows only; those rows are removed by the
    # filter below, so it has no effect on what is plotted
    mutate(est=ifelse(qtile=="mean", exp(est), est),
           CIlow=ifelse(qtile=="mean", exp(CIlow), CIlow),
           CIhigh=ifelse(qtile=="mean", exp(CIhigh), CIhigh),
           study="JHU-IDD") %>% 
    # bind_rows(other_studies) %>% 
    # keep the 2.5th, 50th, and 97.5th percentiles
    filter(qtile %in% c(2.5, 50, 97.5))# %>% 
    # select(-CIhigh, -CIlow) %>% 
    # spread(qtile, est, sep="_")

## Point estimate and confidence interval for each kept percentile, dodged by
## analysis; coord_flip() puts the incubation time on the horizontal axis.
## geom_errorbar() has no `height` parameter (the cap size is set by `width`),
## so the former `height=0.2` was ignored; it is left out here, which keeps
## the rendered figure the same.
ggplot(data=all_sens_plot,
       aes(y=est, ymin=CIlow, ymax=CIhigh, x=as.factor(qtile), color=type)) +
    geom_errorbar(position=position_dodge(0.9)) +
    geom_point(position=position_dodge(0.9)) +
    scale_y_continuous("Incubation time, in days (with 95% CIs)") +
    scale_x_discrete("Estimate quantile") +
    scale_color_manual("Est\ntype",
                       values=cbbPalette[c(1,6,4,7)]) +
    theme_bw() + coord_flip() +
    theme(axis.text=element_text(color="black"))
```

Using only fevers, the estimates are `r min(ncov_fever_fit_boot@ests[-c(1:2),"est"]-ncov_inc_fit_boot@ests[-c(1:2),"est"])` to `r max(ncov_fever_fit_boot@ests[-c(1:2),"est"]-ncov_inc_fit_boot@ests[-c(1:2),"est"])` days longer than the estimates on the full data.
`r sum(ncov_fever$SR<ncov_fever$SR_fever)` of the cases with a fever reported having other symptoms beforehand.
While it may take a little longer for an exposure to cause a fever, the estimates are similar to those of the overall results.
The confidence intervals are wider here at every quantile due to having less data.

Using only cases from outside of Mainland China, the estimates are `r min(ncov_foreign_fit_boot@ests[-c(1:2),"est"]-ncov_inc_fit_boot@ests[-c(1:2),"est"])` to `r max(ncov_foreign_fit_boot@ests[-c(1:2),"est"]-ncov_inc_fit_boot@ests[-c(1:2),"est"])` days longer than the estimates on the full data.
There is a bit of a gap on the long end of the tail, but the confidence intervals overlap for the most part.

When we set the unknown ELs to 2018 December 1 instead of 2019 December 1, the estimates are `r min(ncov_year_fit_boot@ests[-c(1:2),"est"]-ncov_inc_fit_boot@ests[-c(1:2),"est"])` to `r max(ncov_year_fit_boot@ests[-c(1:2),"est"]-ncov_inc_fit_boot@ests[-c(1:2),"est"])` days longer than the estimates on the full data.
Somewhat surprisingly, this changes the estimates less than either of the other alternate estimates. 

## Comparison to other estimates

[Backer, Klinkenberg, & Wallinga](https://www.medrxiv.org/content/10.1101/2020.01.27.20018986v1.full.pdf+html) estimated the incubation period based on 88 early nCoV cases that traveled from Wuhan to other regions in China.
[Li *et al*](https://www.nejm.org/doi/full/10.1056/NEJMoa2001316) estimated the incubation period based on the 10 laboratory-confirmed cases in Wuhan.
A comparison of the incubation period estimates is shown below:

```{r comparison}
## Quantiles (2.5, 25, 50, 75, 97.5) of the incubation period for each of
## our four fits, stacked with the same quantiles computed from the
## parameters in backer_params. Helpers live inside local() so they do
## not leak into the document's environment.
backer_comp <- local({
    # labels for the rows kept from each fit's estimate table
    qtile_labels <- c("mean",2.5, 5, 25, 50, 75, 95, 97.5)

    # estimate table of one fit (second row and fourth column dropped),
    # labelled with the quantile names and the distribution name
    fit_ests <- function(fit, dist_name){
        fit@ests[-2, -4] %>%
            as.data.frame() %>%
            mutate(qtile=qtile_labels,
                   type=dist_name)
    }

    # p-th quantile for each row of parameters: Weibull and gamma are
    # picked by name, every other type is treated as log-normal
    dist_quantile <- function(p, type, par1, par2){
        ifelse(type=="Weibull",
               qweibull(p, par1, par2),
               ifelse(type=="gamma",
                      qgamma(p, par1, scale=par2),
                      qlnorm(p, par1, par2)))
    }

    # our estimates, one row per distribution, one column per quantile
    jhu_quantiles <- bind_rows(fit_ests(ncov_inc_fit_boot, "log-normal"),
                               fit_ests(ncov_gam_fit_boot, "gamma"),
                               fit_ests(ncov_wei_fit_boot, "Weibull"),
                               fit_ests(ncov_erl_fit, "Erlang")) %>%
        # rows labelled "mean" are exponentiated here, then removed by
        # the filter that follows
        mutate(est=ifelse(qtile=="mean", exp(est), est),
               CIlow=ifelse(qtile=="mean",exp(CIlow), CIlow),
               CIhigh=ifelse(qtile=="mean",exp(CIhigh), CIhigh),
               study="JHU-IDD") %>%
        filter(qtile %in% c(2.5, 25, 50, 75, 97.5)) %>%
        select(-CIhigh, -CIlow) %>%
        spread(qtile, est, sep="_") %>%
        mutate(obs=nrow(ncov))

    # the same quantiles derived from the other studies' parameters
    other_quantiles <- backer_params %>%
        mutate(`qtile_2.5`=dist_quantile(.025, type, par1, par2),
               qtile_25=dist_quantile(.25, type, par1, par2),
               qtile_50=dist_quantile(.5, type, par1, par2),
               qtile_75=dist_quantile(.75, type, par1, par2),
               `qtile_97.5`=dist_quantile(.975, type, par1, par2)) %>%
        select(-par1, -par2)

    # order the studies so that JHU-IDD sorts last
    bind_rows(jhu_quantiles, other_quantiles) %>%
        mutate(study_order=ifelse(study=="JHU-IDD", 3,
                                  ifelse(study=="Backer 2020", 2, 1)),
               study=reorder(study, study_order))
})

## Box plots drawn directly from the precomputed quantiles
## (stat="identity"): whiskers span the 2.5%-97.5% quantiles, the box
## spans 25%-75% and the centre bar is the median.
backer_comp %>%
    ggplot(aes(x=type, fill=study,
               ymin=qtile_2.5, lower=qtile_25, middle=qtile_50,
               upper=qtile_75, ymax=qtile_97.5)) +
    geom_boxplot(stat="identity",
                 position = position_dodge2(preserve = "single")) +
    # geom_point(position=position_dodge(0.9)) +
    scale_x_discrete("Distribution") +
    scale_y_continuous("Incubation time, in days",
                       limits=c(0,16)) +
    scale_fill_manual("Study",
                      breaks=c("JHU-IDD", "Backer 2020", "Li 2020"),
                      values=cbbPalette[c(4,2,3)]) +
    theme_bw() +
    # distributions on the vertical axis, days on the horizontal axis
    coord_flip() +
    theme(axis.text=element_text(color="black"))
```

The median estimates from all models lie between `r min(backer_comp$qtile_50) %>% round(2)` and `r max(backer_comp$qtile_50) %>% round(2)`.
The lower and upper tails for our distributions are all closer to the median than those from the other studies; whether this is due to differences in data or in estimation methodologies is open for investigation.

## Parameter estimates

For the convenience of researchers who need parameter estimates for making infectious disease models, we include a table of the parameter estimates from our analysis and inferred from the other analyses. The parameters are different for each distribution; par1 and par2 are log-mean and log-sd of the log-normal distribution, while they are the shape and scale parameters for the gamma, Weibull, and Erlang distributions.

```{r param-est}
## Table of the two fitted parameters of each of our distributions,
## followed by the parameters in backer_params, rounded to 2 decimals.
param_est <- local({
    # first two rows of a fit's "est" column as a one-row data frame
    fitted_pars <- function(fit){
        fit@ests[1:2, "est"] %>%
            t() %>%
            as.data.frame()
    }

    # tag a row of parameters as coming from our analysis
    as_jhu <- function(pars, dist_name){
        mutate(pars,
               study="JHU-IDD",
               type=dist_name,
               obs=nrow(ncov))
    }

    bind_rows(fitted_pars(ncov_inc_fit_boot) %>%
                  rename(par1=meanlog, par2=sdlog) %>%
                  as_jhu("log-normal"),
              fitted_pars(ncov_gam_fit_boot) %>%
                  rename(par1=shape, par2=scale) %>%
                  as_jhu("gamma"),
              fitted_pars(ncov_wei_fit_boot) %>%
                  rename(par1=shape, par2=scale) %>%
                  as_jhu("Weibull"),
              fitted_pars(ncov_erl_fit) %>%
                  rename(par1=shape, par2=scale) %>%
                  as_jhu("Erlang"),
              backer_params) %>%
        select(study, type, obs, par1, par2) %>%
        mutate(par1=round(par1,2),
               par2=round(par2,2))
})

knitr::kable(param_est)
```

## Active monitoring analysis

Given these estimates of the incubation period, we predicted the number of symptomatic infections we would expect to miss over the course of an active monitoring program.
We looked at active monitoring durations from 1 to 28 days for groups of 'low risk' (1/10,000 chance of symptomatic infection), 'medium risk' (1/1,000), 'high risk' (1/100), and 'infected' (1/1), similar to the analysis in [Reich *et al* (2018)](https://www.nature.com/articles/s41598-018-19406-x).

```{r active-monitoring-tbl}
# saveRDS(ncov_inc_fit_boot, "generated-data/ln-fit-boot.rds")
# ncov_inc_fit_boot <- readRDS("generated-data/ln-fit-boot.rds")

## function similar to that of activemonitr's plot_risk_uncertainty()
## but for log-normal distribution
##
## Probability that a monitored individual develops symptoms only after
## monitoring has ended, for every combination of parameter draw, risk
## level and monitoring duration.
##   dic_preds: data frame with columns par1 and par2, passed to plnorm()
##              as meanlog and sdlog; one row per parameter draw
##   phis:      probabilities of symptomatic infection (risk levels)
##   durations: monitoring durations, in days
##   max_u:     optional; when given, each duration is extended by 100
##              offsets u taken from the 0-0.99 quantiles of an
##              exponential whose 99th percentile equals max_u
##              (presumably the time already elapsed since infection when
##              monitoring starts -- confirm), and p is averaged over them
## Returns one row per (idx, phi, d, par1, par2) with p, the mean over u
## of phi * P(incubation time > d + u).
exp_miss_inf <- function(dic_preds,
                         phis,
                         durations,
                         max_u=NULL){
    if(!is.null(max_u)){
        # u <- qunif(seq(0,1,.01),0,max_u)
        u <- qexp(seq(0,0.99,.01), -log(1-0.99)/max_u)
    } else u <- 0
    # seq_len() copes with an empty parameter table, unlike seq(nrow())
    dic_preds$idx <- seq_len(nrow(dic_preds))
    # The whole pipeline is the function's value. Previously return()
    # wrapped only the steps up to mutate(), leaving the aggregation
    # after a return() call, where it is skipped whenever the pipe
    # evaluates its left-hand side lazily.
    crossing(idx=dic_preds$idx,
             phi=phis,
             d=durations,
             u) %>% 
        left_join(dic_preds, by="idx") %>% 
        mutate(p=plnorm(d+u, par1, par2, lower.tail=FALSE)*phi) %>% 
        group_by(idx, phi, d, par1, par2) %>% 
        summarize(p=mean(p)) %>% 
        ungroup()
}

## inputs: parameter samples from the log-normal fit, the risk levels
## (probability of symptomatic infection) and monitoring durations in days
dic_params <- ncov_inc_fit_boot@samples
phis <- c(1, 1/100, 1/1000, 1/10000)
durs <- seq_len(28)

## summarize p across parameter samples for each risk level and duration
## NOTE: the escaped_cases_per_1k_* columns are scaled by 1e4, i.e. they
## hold cases per 10,000 monitored despite the "per_1k" in their names
emi_dat <- exp_miss_inf(dic_params, phis, durs) %>%
    group_by(phi, d) %>%
    summarize(mean_p=mean(p),
              p01=quantile(p, probs=.01),
              p025=quantile(p, probs=.025),
              p50=quantile(p, probs=.5),
              p975=quantile(p, probs=.975),
              p99=quantile(p, probs=.99)) %>%
    mutate(escaped_cases_per_1k_mean = mean_p * 1e4,
           escaped_cases_per_1k_p01 = p01 * 1e4,
           escaped_cases_per_1k_p025 = p025 * 1e4,
           escaped_cases_per_1k_p50 = p50 * 1e4,
           escaped_cases_per_1k_p975 = p975 * 1e4,
           escaped_cases_per_1k_p99 = p99 * 1e4,
           # labels are listed in the same order as phis
           risk_group = factor(phi, levels=phis,
                               labels=c("Infected (1 in 1)",
                                        "High (1 in 100)",
                                        "Medium (1 in 1,000)",
                                        "Low (1 in 10,000)"))) %>%
    ungroup()

## wide table: one row per monitoring duration (7, 14, 21, 28 days), one
## column per risk group, each cell "mean (99th percentile)"
am_tbl_dat <- emi_dat %>%
    # order the risk-group columns by increasing phi
    mutate(risk_group=reorder(risk_group, phi, mean)) %>%
    filter(d %in% seq(7, 28, by=7)) %>%
    transmute(`Monitoring duration`=reorder(paste(d, "days"),d),
              risk_group=risk_group,
              est=paste0(formatC(escaped_cases_per_1k_mean,
                                 digits=1, format="f"),
                         " (",
                         formatC(escaped_cases_per_1k_p99,
                                 digits=1, format="f"),
                         ")")) %>%
    spread(risk_group, est) %>%
    as.data.frame()

# colnames(am_tbl_dat) <- gsub("\\(", "\n\\(", colnames(am_tbl_dat))
# colnames(am_tbl_dat) <- gsub("0\\)", "0\\)\nMedian (95% CI)", colnames(am_tbl_dat))
# window_tbl <- tableGrob(window_tbl_dat[-1,], theme=ttheme_minimal())
# am_tbl <- ggtexttable(am_tbl_dat,
#                       theme=ttheme("classic", padding=unit(c(8,8), "mm")),
#                       rows=rep("", nrow(am_tbl_dat)))

knitr::kable(
    am_tbl_dat,
    caption="Mean estimated symptomatic infections missed per 10,000 monitored (99th percentile), by duration of monitoring and level of risk"
)
```

```{r am-figure}
# grid.arrange(window_fig, window_tbl, nrow=2)
# ggarrange(#am_plot,
#           am_tbl, nrow=1,
#           labels=c("Mean estimated symptomatic infections missed\n           per 10,000 monitored (99th percentile)"),
#           font.label=list(size=13, face="bold"), hjust=-0.35, vjust=5.5)
    
## text and positions (in data units) of the in-plot labels for the lines
label_dat <- tibble(label_text=c("Mean", "1st percentile", "99th percentile"),
                    label_x=c(18, 7, 21),
                    label_y=c(0.001, 0.001, 0.03))

## mean, 1st and 99th percentile curves for the infected group (phi == 1)
emi_dat %>% 
    filter(phi==1) %>% 
    select(d, est_mean=escaped_cases_per_1k_mean,
           est_p01=escaped_cases_per_1k_p01,
           est_p99=escaped_cases_per_1k_p99) %>% 
    gather("quantile", "cases", -d) %>%
ggplot() + 
    # the estimates were scaled by 1e4 upstream; undo that to plot a proportion
    geom_line(aes(x=d, y=cases/1e4, linetype=quantile)) + 
    geom_text(data=label_dat, aes(x=label_x, y=label_y, label=label_text)) +
    scale_y_log10("Proportion of symptomatic infections\n that have yet to develop symptoms",
                  breaks=10^(-4:0),
                  labels=c("1/10,000", "1/1,000", "1/100", "1/10", "1/1")) +
    scale_x_continuous("Days since infection",
                       breaks=seq(0,28,7)) +
    # lines are labelled in the plot, so the legend is suppressed;
    # "none" replaces the deprecated logical form (and reassignable F)
    scale_linetype_discrete("",
                            labels=c("Mean", "1st percentile", "99th percentile"),
                            guide="none") +
    coord_cartesian(ylim=c(1e-4, 1)) +
    theme_bw() +
    theme(axis.text=element_text(color="black"))
```

## Time to hospitalization

We can use the same procedure for estimating the incubation period to estimate the time from symptom onset to hospitalization.

```{r symptom-to-hospital}
## read in coronavirus data and derive, for each case, the symptom-onset
## (S) and hospitalization (P) windows in days since 1 Dec 2019
ncov_hosp <- local({
    # reference time: exposure is restricted to this instant or later and
    # every window bound below is measured from it
    t0_dec2019 <- ymd_hms("2019-12-01 00:00:00")

    # numeric days elapsed from the reference time to x
    days_from_t0 <- function(x){
        as.numeric(difftime(x, t0_dec2019, units="days"))
    }

    read_csv("data/nCoV-IDD-traveler-data.csv") %>%
        rename(EL_date=EL, ER_date=ER, SL_date=SL, SR_date=SR,
               PL_date=PL, PR_date=PR) %>%
        ## fill in missing times and parse the date-times
        mutate(
            # a missing EL, or an EL before 1 Dec 2019, becomes 1 Dec 2019
            EL_date=ifelse(is.na(EL_date),"2019-12-01 00:00:00", EL_date) %>%
                ymd_hms() %>%
                if_else(. < t0_dec2019, t0_dec2019, .),
            # an SR that does not parse as a date-time is replaced by PR
            SR_date=ifelse(ymd_hms(SR_date) %>% is.na,
                           PR_date, SR_date) %>%
                ymd_hms(),
            # SR_fever is only for cases with confirmed fever dates
            SR_fever=ymd_hms(SR_fever),
            PR_date=ymd_hms(PR_date)) %>%
        # missing ER falls back to SR, missing SL to EL, missing PL to SL;
        # a missing SL_fever falls back to SL when SR_fever is known
        mutate(ER_date=if_else(is.na(ER_date), SR_date, ymd_hms(ER_date)),
               SL_date=if_else(is.na(SL_date), EL_date, ymd_hms(SL_date)),
               PL_date=if_else(is.na(PL_date), SL_date, ymd_hms(PL_date)),
               SL_fever=if_else(is.na(SL_fever) & !is.na(SR_fever),
                                SL_date,
                                ymd_hms(SL_fever))) %>%
        # express every bound as days since 1 Dec 2019, then the window
        # widths and the gaps between matching bounds
        mutate(EL=days_from_t0(EL_date),
               ER=days_from_t0(ER_date),
               SL=days_from_t0(SL_date),
               SR=days_from_t0(SR_date),
               SL_fever=days_from_t0(SL_fever),
               SR_fever=days_from_t0(SR_fever),
               PL=days_from_t0(PL_date),
               PR=days_from_t0(PR_date),
               S_int=SR-SL,
               P_int=PR-PL,
               S_fever_int=SR_fever-SL_fever,
               L_diff=PL-SL,
               R_diff=PR-SR) %>%
        # remove any entries missing SL, SR, PL, or PR
        filter(!is.na(SL), !is.na(SR), !is.na(PL), !is.na(PR)) %>%
        # remove entries that haven't been reviewed by two people
        filter(!is.na(REVIEWER2)) %>%
        # keep only entries whose onset and hospitalization windows have
        # positive width, with SR no later than PR and SL no later than PL
        filter(S_int > 0, P_int > 0, SR<=PR, PL>=SL)
})


```

```{r hosp-data-summary, echo=FALSE, message=FALSE, warning=FALSE, fig.cap="This figure displays the symptom onset and hospitalization windows for each case in our dataset, relative to the right-bound of the symptom onset window (SR). The blue bars indicate the symptom onset windows and the red bars indicate the hospitalization windows for each case. Purple areas are where those two bars overlap."}
ncov_hosp %>%
    # shift each case so that SR sits at 0; the *mid columns are the
    # midpoints of the shifted windows; cases are ordered by PR-SL
    mutate(SLnew = SL-SR,
           SRnew = SR-SR,
           Smid = (SLnew + SRnew)/2,
           PLnew = PL-SR,
           PRnew = PR-SR,
           Pmid = (PLnew + PRnew)/2,
           UID=reorder(UID, PR-SL)) %>%
    ggplot(aes(y=factor(UID))) +
    # symptom-onset window (blue) and hospitalization window (red)
    geom_segment(aes(x=SLnew, xend=SRnew, yend=factor(UID)),
                 color="#0072B2", size=2, alpha=.25) +
    geom_segment(aes(x=PLnew, xend=PRnew, yend=factor(UID)),
                 size=2, color="#CC0000", alpha=.25) +
    # window midpoints, joined by a thin grey line
    geom_point(aes(x=Smid, y=factor(UID)), size=0.5, color="#0072B2") +
    geom_point(aes(x=Pmid, y=factor(UID)), size=0.5, color="#CC0000") +
    geom_segment(aes(x=Smid, xend=Pmid, yend=factor(UID)),
                 size=0.33, color="#999999") +
    #ggtitle("Exposure and symptom onset windows") +
    scale_x_continuous("Days from last possible time of symptom onset") +
    scale_y_discrete("Case") +
    theme_bw() +
    # one row per case, so individual case labels are hidden
    theme(axis.text.x=element_text(color="black"),
          axis.text.y = element_blank(),
          axis.ticks.y= element_blank())
```

Of the `r nrow(ncov_hosp)` individuals who developed symptoms in the community (as opposed to in isolation), `r sum((ncov_hosp$R_diff+ncov_hosp$L_diff<=1) & ncov_hosp$P_int<=1)` (`r (100*mean((ncov_hosp$R_diff+ncov_hosp$L_diff<=1) & ncov_hosp$P_int<=1)) %>% round()`%) were hospitalized within a day.

We modeled the time to hospitalization as a gamma distribution:

```{r fit-hosp-dic, results='hide', cache=T}
set.seed(1)
## data to pass in to dic.fit: the symptom-onset window is supplied as
## EL/ER and the hospitalization window as SL/SR; type counts how many
## of the two windows have zero width
ncov_hosp_dat <- ncov_hosp %>%
    mutate(type=as.numeric(S_int==0) + as.numeric(P_int==0)) %>%
    select(EL=SL, ER=SR, PL, PR, type) %>%
    rename(SL=PL, SR=PR) %>%
    as.data.frame()

## fit with dist="G" using the bootstrap (n.boots=1000) for now. MCMC better?
ncov_hosp_fit_boot <- dic.fit(ncov_hosp_dat,
                              dist="G",
                              n.boots=1000,
                              ptiles=c(0.025, 0.25, 0.5, 0.75, 0.975))
```

```{r hosp-plots}
## plot the boot fit and table of intervals
## the fitted interval runs from symptom onset to hospitalization, so the
## x-axis counts days after symptom onset (it previously said "infection")
plot(ncov_hosp_fit_boot, ylab="Proportion symptomatic cases to hospital",
     xlab="Days after symptom onset", main="", xlim=c(0,20))
## horizontal bar from CIlow to CIhigh for the 2.5th, 50th and 97.5th
## percentile estimates, drawn at the matching cumulative proportion
for (ptile in c(2.5, 50, 97.5)) {
    points(y=rep(ptile/100, 2),
           x=ncov_hosp_fit_boot@ests[paste0("p", ptile), c("CIlow", "CIhigh")],
           type='l', col=ci.col, lwd=2.5)
}

knitr::kable(ncov_hosp_fit_boot@ests[,-4])
```

The model estimates that time to hospitalization is `r (ncov_hosp_fit_boot@ests["shape", "est"]*ncov_hosp_fit_boot@ests["scale", "est"]) %>% round(1)` days, on average.
The majority of cases report quickly, though there is a long tail.


```{r fit-hosp-mainland, results='hide', cache=T, include=F}
set.seed(1)
## data to pass in to dic.fit, restricted to cases whose destination is
## China excluding Hong Kong and Macau: the symptom-onset window is
## supplied as EL/ER and the hospitalization window as SL/SR
ncov_hosp_ml_dat <- ncov_hosp %>%
    filter(COUNTRY.DEST == "China",
           !(PROVINCE.DEST %in% c("HongKong", "Macau"))) %>%
    # type counts how many of the two windows have zero width
    mutate(type=as.numeric(S_int==0) + as.numeric(P_int==0)) %>%
    select(EL=SL, ER=SR, PL, PR, type) %>%
    rename(SL=PL, SR=PR) %>%
    as.data.frame()

## fit with dist="G" using the bootstrap (n.boots=1000) for now. MCMC better?
ncov_hosp_ml_fit <- dic.fit(ncov_hosp_ml_dat,
                            dist="G",
                            n.boots=1000,
                            ptiles=c(0.025, 0.25, 0.5, 0.75, 0.975))
```


```{r hosp-plots-ml, include=F}
## plot the boot fit and table of intervals
## the fitted interval runs from symptom onset to hospitalization, so the
## x-axis counts days after symptom onset (it previously said "infection")
plot(ncov_hosp_ml_fit, ylab="Proportion symptomatic cases to hospital",
     xlab="Days after symptom onset", main="", xlim=c(0,20))
## horizontal bar from CIlow to CIhigh for the 2.5th, 50th and 97.5th
## percentile estimates, drawn at the matching cumulative proportion
for (ptile in c(2.5, 50, 97.5)) {
    points(y=rep(ptile/100, 2),
           x=ncov_hosp_ml_fit@ests[paste0("p", ptile), c("CIlow", "CIhigh")],
           type='l', col=ci.col, lwd=2.5)
}

knitr::kable(ncov_hosp_ml_fit@ests[,-4])
```