This document provides a brief tutorial to analyzing twin data using the mets
package:
\(\newcommand{\cov}{\mathbb{C}\text{ov}} \newcommand{\cor}{\mathbb{C}\text{or}} \newcommand{\var}{\mathbb{V}\text{ar}} \newcommand{\E}{\mathbb{E}} \newcommand{\unitfrac}[2]{#1/#2} \newcommand{\n}{}\)
The development version may be installed from github:
# install.packages("remotes")
::install_github("kkholst/mets", dependencies="Suggests") remotes
In the following we examine the heritability of Body Mass Indexkorkeila_bmi_1991 hjelmborg_bmi_2008, based on data on self-reported BMI-values from a random sample of 11,411 same-sex twins. First, we will load data
library(mets)
data("twinbmi")
head(twinbmi)
#> tvparnr bmi age gender zyg id num
#> 1 1 26.33289 57.51212 male DZ 1 1
#> 2 1 25.46939 57.51212 male DZ 1 2
#> 3 2 28.65014 56.62696 male MZ 2 1
#> 5 3 28.40909 57.73097 male DZ 3 1
#> 7 4 27.25089 53.68683 male DZ 4 1
#> 8 4 28.07504 53.68683 male DZ 4 2
The data is on long format with one subject per row.
tvparnr
: twin idbmi
: Body Mass Index (\(\mathrm{kg}/{\mathrm{m}^2}\))age
: Age (years)gender
: Gender factor (male,female)zyg
: zygosity (MZ, DZ)We transpose the data allowing us to do pairwise analyses
<- fast.reshape(twinbmi, id="tvparnr",varying=c("bmi"))
twinwide head(twinwide)
#> tvparnr bmi1 age gender zyg id num bmi2
#> 1 1 26.33289 57.51212 male DZ 1 1 25.46939
#> 3 2 28.65014 56.62696 male MZ 2 1 NA
#> 5 3 28.40909 57.73097 male DZ 3 1 NA
#> 7 4 27.25089 53.68683 male DZ 4 1 28.07504
#> 9 5 27.77778 52.55838 male DZ 5 1 NA
#> 11 6 28.04282 52.52231 male DZ 6 1 22.30936
Next we plot the association within each zygosity group
We here show the log-transformed data which is slightly more symmetric and more appropiate for the twin analysis (see Figure @ref(fig:scatter1) and @ref(fig:scatter2))
<- log(subset(twinwide, zyg=="MZ")[,c("bmi1","bmi2")])
mz scatterdens(mz)
<- log(subset(twinwide, zyg=="DZ")[,c("bmi1","bmi2")])
dz scatterdens(dz)
The plots and raw association measures shows considerable stronger dependence in the MZ twins, thus indicating genetic influence of the trait
cor.test(mz[,1],mz[,2], method="spearman")
#>
#> Spearman's rank correlation rho
#>
#> data: mz[, 1] and mz[, 2]
#> S = 165457624, p-value < 2.2e-16
#> alternative hypothesis: true rho is not equal to 0
#> sample estimates:
#> rho
#> 0.6956209
cor.test(dz[,1],dz[,2], method="spearman")
#>
#> Spearman's rank correlation rho
#>
#> data: dz[, 1] and dz[, 2]
#> S = 2162514570, p-value < 2.2e-16
#> alternative hypothesis: true rho is not equal to 0
#> sample estimates:
#> rho
#> 0.4012686
Ńext we examine the marginal distribution (GEE model with working independence)
<- lm(bmi ~ gender + I(age-40), data=twinbmi)
l0 estimate(l0, id=twinbmi$tvparnr)
#> Estimate Std.Err 2.5% 97.5% P-value
#> (Intercept) 23.3687 0.054534 23.2618 23.4756 0.000e+00
#> gendermale 1.4077 0.073216 1.2642 1.5512 2.230e-82
#> I(age - 40) 0.1177 0.004787 0.1083 0.1271 1.499e-133
library("splines")
<- lm(bmi ~ gender*ns(age,3), data=twinbmi)
l1 <- estimate(l1, id=twinbmi$tvparnr) marg1
<- Expand(twinbmi,
dm bmi=0,
gender=c("male"),
age=seq(33,61,length.out=50))
<- Expand(twinbmi,
df bmi=0,
gender=c("female"),
age=seq(33,61,length.out=50))
plot(marg1, function(p) model.matrix(l1,data=dm)%*%p,
data=dm["age"], ylab="BMI", xlab="Age",
ylim=c(22,26.5))
plot(marg1, function(p) model.matrix(l1,data=df)%*%p,
data=df["age"], col="red", add=TRUE)
legend("bottomright", c("Male","Female"),
col=c("black","red"), lty=1, bty="n")
We can decompose the trait into the following variance components
\[\begin{align*} Y_i = A_i + D_i + C + E_i, \quad i=1,2 \end{align*}\]
Dissimilarity of MZ twins arises from unshared environmental effects only, \(\cor(E_1,E_2)=0\) and
\[\begin{align*} \cor(A_1^{MZ},A_2^{MZ}) = 1, \quad \cor(D_1^{MZ},D_2^{MZ}) = 1, \end{align*}\]
\[\begin{align*} \cor(A_1^{DZ},A_2^{DZ}) = 0.5, \quad \cor(D_1^{DZ},D_2^{DZ}) = 0.25, \end{align*}\]
\[\begin{align*} Y_i = A_i + C_i + D_i + E_i \end{align*}\]
\[\begin{align*} A_i \sim\mathcal{N}(0,\sigma_A^2), C_i \sim\mathcal{N}(0,\sigma_C^2), D_i \sim\mathcal{N}(0,\sigma_D^2), E_i \sim\mathcal{N}(0,\sigma_E^2) \end{align*}\]
\[\begin{gather*} \cov(Y_{1},Y_{2}) = \\ \begin{pmatrix} \sigma_A^2 & 2\Phi\sigma_A^2 \\ 2\Phi\sigma_A^2 & \sigma_A^2 \end{pmatrix} + \begin{pmatrix} \sigma_C^2 & \sigma_C^2 \\ \sigma_C^2 & \sigma_C^2 \end{pmatrix} + \begin{pmatrix} \sigma_D^2 & \Delta_{7}\sigma_D^2 \\ \Delta_{7}\sigma_D^2 & \sigma_D^2 \end{pmatrix} + \begin{pmatrix} \sigma_E^2 & 0 \\ 0 & \sigma_E^2 \end{pmatrix} \end{gather*}\]
<- na.omit(twinbmi)
dd <- twinlm(bmi ~ age+gender, data=dd, DZ="DZ", zyg="zyg", id="tvparnr", type="sat")
l0
# different marginals (but within pair)
<- twinlm(bmi ~ age+gender, data=dd,DZ="DZ", zyg="zyg", id="tvparnr", type="flex")
lf
# same marginals but free correlation with MZ, DZ
<- twinlm(bmi ~ age+gender, data=dd, DZ="DZ", zyg="zyg", id="tvparnr", type="u")
lu estimate(lu,contr(5:6,6))
#> Estimate Std.Err 2.5% 97.5% P-value
#> [atanh(rhoMZ)@1] - [a.... 0.4816 0.04177 0.3997 0.5635 9.431e-31
#>
#> Null Hypothesis:
#> [atanh(rhoMZ)@1] - [atanh(rhoDZ)@2] = 0
estimate(lu)
#> Estimate Std.Err 2.5% 97.5% P-value
#> bmi.1@1 18.6037 0.251036 18.1116 19.0957 0.000e+00
#> bmi.1~age.1@1 0.1189 0.005635 0.1078 0.1299 9.177e-99
#> bmi.1~gendermale.1@1 1.3848 0.086573 1.2151 1.5544 1.376e-57
#> log(var)@1 2.4424 0.022095 2.3991 2.4857 0.000e+00
#> atanh(rhoMZ)@1 0.7803 0.036249 0.7092 0.8513 9.008e-103
#> atanh(rhoDZ)@2 0.2987 0.020953 0.2576 0.3397 4.288e-46
<- twinlm(bmi ~ zyg, data=dd, DZ="DZ", zyg="zyg", id="tvparnr", type="flex")
lf coef(lf)
#> bmi.1@1 bmi.1@2 bmi.1~zygMZ.1@1 log(var(MZ))@1 atanh(rhoMZ)@1
#> 24.5832960 24.6575702 -0.3968696 2.5289387 0.8362321
#> bmi.1~zygMZ.1@2 log(var(DZ))@2 atanh(rhoDZ)@2
#> -0.3968689 2.5659593 0.3860756
###sink("lu-est-summary.txt")
<- twinlm(bmi ~ zyg, data=dd, DZ="DZ", zyg="zyg", id="tvparnr", type="u")
lu summary(lu)
#> ____________________________________________________
#> Group 1: MZ (n=1483)
#> Estimate Std. Error Z value Pr(>|z|)
#> Regressions:
#> bmi.1~zygMZ.1 -0.47114 0.10242 -4.60001 4.225e-06
#> Intercepts:
#> bmi.1 24.65757 0.05614 439.18694 <1e-12
#> Additional Parameters:
#> log(var) 2.55537 0.01699 150.37316 <1e-12
#> atanh(rhoMZ) 0.84865 0.02296 36.96325 <1e-12
#> ____________________________________________________
#> Group 2: DZ (n=2788)
#> Estimate Std. Error Z value Pr(>|z|)
#> Regressions:
#> bmi.1~zygMZ.1 -0.47114 0.10242 -4.60001 4.225e-06
#> Intercepts:
#> bmi.1 24.65757 0.05614 439.18694 <1e-12
#> Additional Parameters:
#> log(var) 2.55537 0.01699 150.37316 <1e-12
#> atanh(rhoDZ) 0.38268 0.01847 20.71970 <1e-12
#>
#> Estimate 2.5% 97.5%
#> Correlation within MZ: 0.69036 0.66607 0.71319
#> Correlation within DZ: 0.36503 0.33325 0.39598
#>
#> 'log Lik.' -22355.15 (df=5)
#> AIC: 44720.3
#> BIC: 44752.1
estimate(lu)
#> Estimate Std.Err 2.5% 97.5% P-value
#> bmi.1@1 24.6576 0.05650 24.5468 24.7683 0.000e+00
#> bmi.1~zygMZ.1@1 -0.4711 0.10155 -0.6702 -0.2721 3.489e-06
#> log(var)@1 2.5554 0.02019 2.5158 2.5949 0.000e+00
#> atanh(rhoMZ)@1 0.8486 0.03554 0.7790 0.9183 5.146e-126
#> atanh(rhoDZ)@2 0.3827 0.02095 0.3416 0.4237 1.545e-74
crossprod(iid(lu))^.5
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.05650271 NaN 0.019575810 0.01346928 NaN
#> [2,] NaN 0.10154621 0.004344490 NaN 0.015803415
#> [3,] 0.01957581 0.00434449 0.020190842 0.01053796 0.006669272
#> [4,] 0.01346928 NaN 0.010537960 0.03554047 NaN
#> [5,] NaN 0.01580342 0.006669272 NaN 0.020950321
###sink()
vcov(lu)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 3.152113e-03 -3.152113e-03 8.675199e-09 4.107103e-09 7.707502e-09
#> [2,] -3.152113e-03 1.049033e-02 -3.973997e-10 3.391234e-09 -5.078878e-09
#> [3,] 8.675199e-09 -3.973997e-10 2.887794e-04 1.367167e-04 9.170282e-05
#> [4,] 4.107103e-09 3.391234e-09 1.367167e-04 5.271249e-04 4.341482e-05
#> [5,] 7.707502e-09 -5.078878e-09 9.170282e-05 4.341482e-05 3.411139e-04
#> attr(,"det")
#> [1] 1.037717e+15
#> attr(,"pseudo")
#> [1] FALSE
#> attr(,"minSV")
#> [1] 85.77516
estimate(lu)
#> Estimate Std.Err 2.5% 97.5% P-value
#> bmi.1@1 24.6576 0.05650 24.5468 24.7683 0.000e+00
#> bmi.1~zygMZ.1@1 -0.4711 0.10155 -0.6702 -0.2721 3.489e-06
#> log(var)@1 2.5554 0.02019 2.5158 2.5949 0.000e+00
#> atanh(rhoMZ)@1 0.8486 0.03554 0.7790 0.9183 5.146e-126
#> atanh(rhoDZ)@2 0.3827 0.02095 0.3416 0.4237 1.545e-74
dim(iid(lu))
#> [1] 4271 5
estimate(lu,contr(4:5,5))
#> Estimate Std.Err 2.5% 97.5% P-value
#> [atanh(rhoMZ)@1] - [a.... 0.466 0.04138 0.3849 0.5471 2.026e-29
#>
#> Null Hypothesis:
#> [atanh(rhoMZ)@1] - [atanh(rhoDZ)@2] = 0
estimate(coef=coef(lu),vcov=vcov(lu),contr(4:5,5))
#> Estimate Std.Err 2.5% 97.5% P-value
#> bmi.1@1 24.6576 0.05614 24.5475 24.7676 0.000e+00
#> bmi.1~zygMZ.1@1 -0.4711 0.10242 -0.6719 -0.2704 4.225e-06
#> log(var)@1 2.5554 0.01699 2.5221 2.5887 0.000e+00
#> atanh(rhoMZ)@1 0.8486 0.02296 0.8036 0.8936 4.462e-299
#> atanh(rhoDZ)@2 0.3827 0.01847 0.3465 0.4189 2.301e-95
wald.test(coef=coef(lu),vcov=vcov(lu),contrast=c(0,0,0,1,-1))
#> lin.comb se lower upper pval
#> B 0.465969 0.0279537 0.4111807 0.5207572 0
#>
#> Wald test
#>
#> data:
#> chisq = 277.87, df = 1, p-value < 2.2e-16
<- twinlm(bmi ~ ns(age,1)+gender, data=twinbmi,
l DZ="DZ", zyg="zyg", id="tvparnr", type="cor", missing=TRUE)
summary(l)
#> ____________________________________________________
#> Group 1
#> Estimate Std. Error Z value Pr(>|z|)
#> Regressions:
#> bmi.1~ns(age, 1).1 4.16937 0.16669 25.01334 <1e-12
#> bmi.1~gendermale.1 1.41160 0.07284 19.37839 <1e-12
#> Intercepts:
#> bmi.1 22.53618 0.07296 308.87100 <1e-12
#> Additional Parameters:
#> log(var) 2.44580 0.01425 171.68256 <1e-12
#> atanh(rhoMZ) 0.78217 0.02290 34.16186 <1e-12
#> ____________________________________________________
#> Group 2
#> Estimate Std. Error Z value Pr(>|z|)
#> Regressions:
#> bmi.1~ns(age, 1).1 4.16937 0.16669 25.01334 <1e-12
#> bmi.1~gendermale.1 1.41160 0.07284 19.37839 <1e-12
#> Intercepts:
#> bmi.1 22.53618 0.07296 308.87100 <1e-12
#> Additional Parameters:
#> log(var) 2.44580 0.01425 171.68256 <1e-12
#> atanh(rhoDZ) 0.29924 0.01848 16.19580 <1e-12
#>
#> Estimate 2.5% 97.5%
#> Correlation within MZ: 0.65395 0.62751 0.67889
#> Correlation within DZ: 0.29061 0.25712 0.32341
#>
#> 'log Lik.' -29020.12 (df=6)
#> AIC: 58052.24
#> BIC: 58093.29
A formal test of genetic effects can be obtained by comparing the MZ and DZ correlation:
estimate(l,contr(5:6,6))
#> Estimate Std.Err 2.5% 97.5% P-value
#> [atanh(rhoMZ)@1] - [a.... 0.4829 0.04176 0.4011 0.5648 6.325e-31
#>
#> Null Hypothesis:
#> [atanh(rhoMZ)@1] - [atanh(rhoDZ)@3] = 0
<- twinlm(bmi ~ ns(age,1)+gender, data=twinbmi,
l DZ="DZ", zyg="zyg", id="tvparnr", type="cor", missing=TRUE)
summary(l)
#> ____________________________________________________
#> Group 1
#> Estimate Std. Error Z value Pr(>|z|)
#> Regressions:
#> bmi.1~ns(age, 1).1 4.16937 0.16669 25.01334 <1e-12
#> bmi.1~gendermale.1 1.41160 0.07284 19.37839 <1e-12
#> Intercepts:
#> bmi.1 22.53618 0.07296 308.87100 <1e-12
#> Additional Parameters:
#> log(var) 2.44580 0.01425 171.68256 <1e-12
#> atanh(rhoMZ) 0.78217 0.02290 34.16186 <1e-12
#> ____________________________________________________
#> Group 2
#> Estimate Std. Error Z value Pr(>|z|)
#> Regressions:
#> bmi.1~ns(age, 1).1 4.16937 0.16669 25.01334 <1e-12
#> bmi.1~gendermale.1 1.41160 0.07284 19.37839 <1e-12
#> Intercepts:
#> bmi.1 22.53618 0.07296 308.87100 <1e-12
#> Additional Parameters:
#> log(var) 2.44580 0.01425 171.68256 <1e-12
#> atanh(rhoDZ) 0.29924 0.01848 16.19580 <1e-12
#>
#> Estimate 2.5% 97.5%
#> Correlation within MZ: 0.65395 0.62751 0.67889
#> Correlation within DZ: 0.29061 0.25712 0.32341
#>
#> 'log Lik.' -29020.12 (df=6)
#> AIC: 58052.24
#> BIC: 58093.29
We also consider the ACE model
<- twinlm(bmi ~ age+gender, data=dd, DZ="DZ", zyg="zyg", id="tvparnr", type="ace") ace0
[korkeila_bmi_1991] Korkeila, Kaprio, Rissanen & Koskenvuo, Effects of gender and age on the heritability of body mass index, Int J Obes, 15(10), 647-654 (1991). ↩︎
[hjelmborg_bmi_2008] Hjelmborg, Fagnani, Silventoinen, McGue, Korkeila, Christensen, Rissanen & Kaprio, Genetic influences on growth traits of BMI: a longitudinal study of adult twins, Obesity (Silver Spring), 16(4), 847-852 (2008). ↩︎