Negative binomial regression with censored data: POLYMOD data

This post describes my attempt to reproduce Table 1 of the paper, Social Contacts and Mixing Patterns Relevant to the Spread of Infectious Diseases. Data were downloaded from Social Contact Data, which is hosted on Zenodo; I used version 1.1. In summary, I was not able to reproduce the table exactly, but I still wanted to document the process I went through.

Data preparation

library(data.table)
library(dplyr)

d1 <- fread("2008_Mossong_POLYMOD_participant_common.csv")
# d1 <- fread("data/2008_Mossong_POLYMOD_participant_common.csv")
d2 <- fread("2008_Mossong_POLYMOD_contact_common.csv")
# d2 <- fread("data/2008_Mossong_POLYMOD_contact_common.csv")
# count the number of contacts for each participant using the part_id variable
d2 |> group_by(part_id) |>
  summarize(contacts = n()) -> d2_contacts
d12 <- left_join(d1, d2_contacts, by="part_id")
# add household information
d3 <- fread("2008_Mossong_POLYMOD_hh_common.csv")
# d3 <- fread("data/2008_Mossong_POLYMOD_hh_common.csv")
d123 <- left_join(d12, d3, by="hh_id")
# add day of week information
d4 <- fread("2008_Mossong_POLYMOD_sday.csv")
# d4 <- fread("data/2008_Mossong_POLYMOD_sday.csv")
dat <- left_join(d123, d4, by="part_id")
Data manipulation
Categorize age into 10 groups: 0-4, 5-9, 10-14, 15-19, then 10-year bands from 20-29 through 60-69, and 70 and above.
<- c("0-4","5-9","10-14","15-19","20-29",
age_grp_label "30-39","40-49","50-59","60-69","70+")
$age_grp <- ifelse(is.na(dat$part_age), "Missing",
datifelse(dat$part_age < 20,
floor(dat$part_age/5) + 1],
age_grp_label[ifelse(dat$part_age < 80,
floor(dat$part_age/10) + 3],
age_grp_label[10]))) age_grp_label[
Compare the number of participants by age group (the third column of Table 1)
dat |> group_by(age_grp) |> summarize(npart=n()) -> npart_ag
# hard-coded using the values in Table 1
# note: the rows of npart_ag are sorted alphabetically by age_grp, whereas the
# values below are entered in Table 1 order, so the two columns do not line up
# for the 5-9 through 40-49 rows; the counts themselves match Table 1
npart_ag$true <- c(660,661,713,685,879,815,908,906,728,270,65)
npart_ag
# A tibble: 11 × 3
age_grp npart true
<chr> <int> <dbl>
1 0-4 660 660
2 10-14 713 661
3 15-19 685 713
4 20-29 879 685
5 30-39 815 879
6 40-49 908 815
7 5-9 661 908
8 50-59 906 906
9 60-69 728 728
10 70+ 270 270
11 Missing 65 65
Categorize the household size
classify_household <- function(d){
  d$hh_size_grp <- "Missing"
  d$hh_size_grp <- ifelse(d$hh_size > 5, "6+", as.character(d$hh_size))
  return(d)
}
dat <- classify_household(dat)
Classify gender
classify_gender <- function(d) {
  d$gender <- "Missing"
  d$gender <- ifelse(d$part_gender == "M", "M",
                     ifelse(d$part_gender == "F", "F", d$gender))
  return(d)
}
dat <- classify_gender(dat)
Classify day of week
<- c("Sunday","Monday","Tuesday","Wednesday","Thursday",
dayofweek_label "Friday","Saturday")
<- function(d) {
classify_dayofweek $dayofweek_f <- "Missing"
dfor (i in 1:nrow(d)) {
= d$dayofweek[i]
day if (!is.na(day)) {
$dayofweek_f[i] <- dayofweek_label[day+1]
d
}
}return(d)
}<- classify_dayofweek(dat)
dat # change names for conveninence
$dayofweek_integer <- dat$dayofweek
dat$dayofweek <- dat$dayofweek_f dat
Make the categorical variables factors for the regression and set the reference groups with relevel(x, ref=ref) to match Table 1.
# set the categorical variables as factor for regression
dat$age_grp <- factor(dat$age_grp, levels=c(age_grp_label,"Missing"))
dat$age_grp <- relevel(dat$age_grp, ref="0-4")
dat$gender <- factor(dat$gender, levels=c("F","M","Missing"))
dat$gender <- relevel(dat$gender, ref="F")
dat$dayofweek <- factor(dat$dayofweek, levels=c(dayofweek_label,"Missing"))
dat$dayofweek <- relevel(dat$dayofweek, ref="Sunday")
dat$hh_size_grp <- as.factor(dat$hh_size_grp)
dat$hh_size_grp <- relevel(dat$hh_size_grp, ref="1")
dat$country <- factor(dat$country, levels=c("BE","DE","FI","GB","IT","LU","NL","PL"))
dat$country <- relevel(dat$country, ref="BE")

# fwrite(dat, "POLYMOD_2017.csv")
Assign weights to individual participants based on supplementary Table 2. My approach was to identify the row and the column of the relevant weight from each participant's age and household size. The weight for an age group was calculated by dividing the proportion of that age group in the population (from the census) by its proportion in the sample.
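As a toy illustration of that ratio (the numbers below are made up and are not from supplementary Table 2): if an age group makes up 6% of the census population but only 4% of the sample, its weight is 0.06/0.04 = 1.5, so that group gets up-weighted.

# made-up proportions, purely to illustrate the "Ratio C/S" weight
prop_census <- 0.06  # proportion of the age group in the census population (C)
prop_sample <- 0.04  # proportion of the age group in the sample (S)
prop_census / prop_sample  # weight = 1.5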
find_age_row_column <- function(d) {
  d$age_row <- NA
  for (i in 1:nrow(d)) {
    ag <- d$part_age[i]
    if (!is.na(ag)) {
      for (j in 1:14) {
        if (ag >= (j-1)*5 & ag < (j-1)*5+5) {
          d$age_row[i] <- j
          break
        } else if (ag >= 70) {
          d$age_row[i] <- 15
          break
        } else {}
      }
    }
  }
  d$hh_col <- NA
  for (i in 1:nrow(d)) {
    hs <- d$hh_size[i]
    if (!is.na(hs)) {
      for (j in 1:4) {
        if (hs == j) {
          d$hh_col[i] <- j
          break
        } else if (hs > 4) {
          d$hh_col[i] <- 5
        } else {}
      }
    }
  }
  return(d)
}
dat <- find_age_row_column(dat)
# wlist <- rio::import_list("data/sampling_weight.xlsx")
wlist <- rio::import_list("sampling_weight.xlsx")
classify_weight <- function(d){
  d$wt <- NA
  cnames <- names(wlist)
  for (i in 1:length(cnames)) {
    wtable <- wlist[[i]]
    w1 <- wtable[wtable$`Household size` == "Ratio C/S",] # sampling weight
    W <- w1[!is.na(w1$`Household size`),] # remove the first row
    # View(W)
    for (j in 1:nrow(d)){
      if (d$country[j] == cnames[i]) {
        d$wt[j] <- W[d$age_row[j], d$hh_col[j]+2]
      }
    }
  }
  return(d)
}
# grep("-", as.character(d$wt), value = T)
# hist(as.numeric(d$wt))
dat <- classify_weight(dat)
dat$wt <- as.numeric(dat$wt)
Data for fitting: complete cases only
There are missing values for contacts ($n$=36) and weight ($n$=65). It is not clear how those observations were treated in the paper's model, which may be one reason why I cannot reproduce the results in Table 1 exactly.
dat_ <- dat
## would imputation for the 36 observations make a difference?
# dat$contacts_ori <- dat$contacts
# dat$contacts <- ifelse(is.na(dat$contacts), round(mean(dat$contacts, na.rm=T)), dat$contacts)
model_var <- c("contacts", "age_grp", "gender", "dayofweek", "hh_size_grp", "country", "wt")
dat <- dat[,..model_var]
# dat <- dat[complete.cases(dat),]
dat <- dat[!is.na(contacts),]
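For reference, the missingness counts quoted above can be tallied from the saved copy dat_, for example:

# participants with no contact rows (contacts is NA after the left join)
sum(is.na(dat_$contacts))
# participants with a missing sampling weight
sum(is.na(dat_$wt))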
NegBin regression: no censoring and no weighting
library(MASS)
m <- glm.nb(contacts ~ age_grp + gender + dayofweek + hh_size_grp + country,
            data = dat)
# summary(m)
exp(m$coefficients)
(Intercept) age_grp5-9 age_grp10-14 age_grp15-19
5.6764958 1.4022341 1.6768688 1.6824225
age_grp20-29 age_grp30-39 age_grp40-49 age_grp50-59
1.4458386 1.4168460 1.3868642 1.2940456
age_grp60-69 age_grp70+ age_grpMissing genderM
1.0520198 0.8120547 1.0338752 0.9941243
genderMissing dayofweekMonday dayofweekTuesday dayofweekWednesday
1.3434436 1.3179053 1.3998594 1.3876252
dayofweekThursday dayofweekFriday dayofweekSaturday dayofweekMissing
1.3858679 1.4403113 1.1542837 1.2179674
hh_size_grp2 hh_size_grp3 hh_size_grp4 hh_size_grp5
1.0990550 1.1255471 1.2592436 1.3360603
hh_size_grp6+ countryDE countryFI countryGB
1.4601697 0.7036079 0.9483997 0.9797998
countryIT countryLU countryNL countryPL
1.6351704 1.4081732 1.2493105 1.3223343
Regression that accounts for the censored number of contacts. The paper reads: "The data were right censored at 29 contacts for all countries because of a limited number of possible diary entries in some countries."
\[ \log L = \sum_{i=1}^n w_i\left(\delta_i\,\log P\left(Y=y_i \mid X_i\right) + \left(1-\delta_i\right)\,\log \left(1-\sum_{k=0}^{28}P(Y=k \mid X_i)\right) \right), \]
where
\[ \delta_i=\begin{cases} 1, & \text{if } y_i<29,\\ 0, & \text{otherwise}. \end{cases} \]
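As a minimal numeric illustration of the two terms (with made-up values for the mean and dispersion), an uncensored observation contributes the negative binomial log probability of its observed count, while a censored observation contributes the log probability of the tail $P(Y \ge 29)$:

# made-up parameter values, purely to illustrate the two likelihood contributions
mu   <- 10  # negative binomial mean
size <- 2   # dispersion parameter (theta)
dnbinom(5, mu = mu, size = size, log = TRUE)  # uncensored: log P(Y = 5)
log(1 - pnbinom(28, mu = mu, size = size))    # censored: log P(Y >= 29)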
Take censoring into account
X = model.matrix(m)
# X <- model.matrix(~ age_grp + gender + dayofweek + hh_size_grp + country, data=dat)
# ini = c(coef(m1), log_theta = log(summary(m1)$theta))
init = c(coef(m), size=summary(m)$theta)

negll_censor <- function(par, y, X, ul=400) {
  # parameters
  size = par[length(par)]
  beta = par[-length(par)]
  # create indicator depending on chosen limit
  indicator = y < ul
  # linear predictor
  mu = exp(X %*% beta)
  # log likelihood
  ll = sum(indicator * dnbinom(y, mu=mu, size=size, log=T) +
           (1-indicator) * log(1-pnbinom(ul, mu=mu, size=size)), na.rm=T)
  return(-ll)
}
# you can check if two methods (glm.nb vs. optim) match by setting ul high (e.g., 100)
fit1 <- optim(par = init,
              negll_censor,
              y = dat$contacts,
              X = X,
              ul = 29,
              method = "Nelder-Mead",
              control = list(maxit=1e3, reltol=1e-10))
exp(fit1$par)
(Intercept) age_grp5-9 age_grp10-14 age_grp15-19
5.6047431 1.4335817 1.7328973 1.7131792
age_grp20-29 age_grp30-39 age_grp40-49 age_grp50-59
1.4411616 1.4214491 1.3811132 1.2825334
age_grp60-69 age_grp70+ age_grpMissing genderM
1.0518372 0.8182114 1.0369387 0.9825757
genderMissing dayofweekMonday dayofweekTuesday dayofweekWednesday
1.3945323 1.3504467 1.4340414 1.4146111
dayofweekThursday dayofweekFriday dayofweekSaturday dayofweekMissing
1.4265521 1.4638406 1.1583928 1.2481137
hh_size_grp2 hh_size_grp3 hh_size_grp4 hh_size_grp5
1.0879976 1.1266973 1.2623833 1.3638927
hh_size_grp6+ countryDE countryFI countryGB
1.4878961 0.6934848 0.9561751 1.0065923
countryIT countryLU countryNL countryPL
1.6915581 1.3931403 1.2521232 1.3348934
size
18.9737736
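As the comment above suggests, the optim machinery can be sanity-checked against glm.nb. A simple variant of that check is to drop the censoring term entirely and maximize the plain negative binomial log likelihood, which should land close to coef(m); negll_plain below is my own helper, not part of the original analysis.

# plain (uncensored) negative binomial negative log likelihood
negll_plain <- function(par, y, X) {
  size <- par[length(par)]
  beta <- par[-length(par)]
  mu   <- exp(X %*% beta)
  -sum(dnbinom(y, mu = mu, size = size, log = TRUE))
}
fit_check <- optim(par = init, negll_plain, y = dat$contacts, X = X,
                   method = "Nelder-Mead",
                   control = list(maxit = 1e4, reltol = 1e-10))
# coefficients should be close to the glm.nb fit
max(abs(fit_check$par[-length(fit_check$par)] - coef(m)))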
Take censoring & weighting into account
X = model.matrix(m)
# X <- model.matrix(~ age_grp + gender + dayofweek + hh_size_grp + country, data=dat) # full matrix
# ini = c(coef(m1), log_theta = log(summary(m1)$theta))
init = c(coef(m), size=summary(m)$theta)

negll_censor_weight <- function(par, y, X, wt, ul=400) {
  # parameters
  size = par[length(par)]
  beta = par[-length(par)]
  # create indicator depending on chosen limit
  indicator = y < ul
  # linear predictor
  mu = exp(X %*% beta)
  # log likelihood
  ll = sum(wt*(indicator * dnbinom(y, mu=mu, size=size, log=T)
               + (1-indicator) * log(1-pnbinom(ul, mu=mu, size=size))), na.rm=T)
  return(-ll)
}
fit2 <- optim(par = init,
              negll_censor_weight,
              y = dat$contacts,
              X = X,
              wt = dat$wt,
              ul = 29,
              method = "Nelder-Mead",
              control = list(maxit=1e3, reltol=1e-10))
exp(fit2$par)
(Intercept) age_grp5-9 age_grp10-14 age_grp15-19
5.6120605 1.4242420 1.7386774 1.7031201
age_grp20-29 age_grp30-39 age_grp40-49 age_grp50-59
1.4590198 1.4488638 1.3920936 1.3145593
age_grp60-69 age_grp70+ age_grpMissing genderM
1.0690678 0.8092529 0.9944973 0.9919151
genderMissing dayofweekMonday dayofweekTuesday dayofweekWednesday
1.0153858 1.3261171 1.3904167 1.3853431
dayofweekThursday dayofweekFriday dayofweekSaturday dayofweekMissing
1.4001935 1.4212925 1.1905515 1.2452143
hh_size_grp2 hh_size_grp3 hh_size_grp4 hh_size_grp5
1.1014090 1.1254771 1.2788309 1.3717150
hh_size_grp6+ countryDE countryFI countryGB
1.4784786 0.6889027 0.9561525 0.9925714
countryIT countryLU countryNL countryPL
1.6631404 1.4198698 1.3256452 1.3635879
size
17.7725195
Finally, compare the estimates reported in Table 1 (est) with my estimates from the censored, weighted fit (myest).

da <- data.frame(
  parm=c("0-4", "5-9", "10-14", "15-19", "20-29", "30-39", "40-49",
         "50-59", "60-69", "70+", "Missing",
         "F","M","Missing",
         "1", "2", "3", "4", "5", "6+",
         "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
         "Saturday", "Missing",
         "BE", "DE", "FI", "GB", "IT", "LU", "NL", "PL"),
  est = c(1.00, 1.42, 1.73, 1.68, 1.45, 1.45, 1.38, 1.31, 1.06, 0.81, 0.91,
          1.00, 0.99, 1.57,
          1.00, 1.17, 1.20, 1.36, 1.46, 1.56,
          1.00, 1.33, 1.39, 1.38, 1.41, 1.43, 1.20, 1.24,
          1.00, 0.70, 0.94, 0.99, 1.66, 1.42, 1.34, 1.37))
da$myest = round(c(1.00, exp(fit2$par)[2:11],
                   1.00, exp(fit2$par)[12:13],
                   1.00, exp(fit2$par)[21:25],
                   1.00, exp(fit2$par)[14:20],
                   1.00, exp(fit2$par)[26:32]), digits=2)
da
parm est myest
1 0-4 1.00 1.00
2 5-9 1.42 1.42
3 10-14 1.73 1.74
4 15-19 1.68 1.70
5 20-29 1.45 1.46
6 30-39 1.45 1.45
7 40-49 1.38 1.39
8 50-59 1.31 1.31
9 60-69 1.06 1.07
10 70+ 0.81 0.81
11 Missing 0.91 0.99
12 F 1.00 1.00
13 M 0.99 0.99
14 Missing 1.57 1.02
15 1 1.00 1.00
16 2 1.17 1.10
17 3 1.20 1.13
18 4 1.36 1.28
19 5 1.46 1.37
20 6+ 1.56 1.48
21 Sunday 1.00 1.00
22 Monday 1.33 1.33
23 Tuesday 1.39 1.39
24 Wednesday 1.38 1.39
25 Thursday 1.41 1.40
26 Friday 1.43 1.42
27 Saturday 1.20 1.19
28 Missing 1.24 1.25
29 BE 1.00 1.00
30 DE 0.70 0.69
31 FI 0.94 0.96
32 GB 0.99 0.99
33 IT 1.66 1.66
34 LU 1.42 1.42
35 NL 1.34 1.33
36 PL 1.37 1.36
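As a quick summary of the comparison (the 0.05 cut-off below is arbitrary, chosen just for display):

# largest absolute difference between the published estimates and mine
max(abs(da$est - da$myest))
# parameters where the two estimates differ by more than 0.05
da[abs(da$est - da$myest) > 0.05, ]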