dat <-read.csv("NHANES1.csv")# make factorsinteger_info <-sapply(dat, is.integer)integer_info[which(names(integer_info) =="age")] <-FALSE# age should stay an integerdat[integer_info] <-lapply(dat[integer_info], as.factor)
Task 1.1
How strong is the relationship between BMI and systolic blood pressure?
summary(dat$bmi)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
13.40 23.80 27.40 28.58 32.00 80.60 290
summary(dat$rr_sys)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
81.33 110.67 120.00 123.00 132.00 234.67 437
cor(dat$bmi, dat$rr_sys,use='complete.obs')
[1] 0.1357793
cor.test(dat$bmi, dat$rr_sys)
Pearson's product-moment correlation
data: dat$bmi and dat$rr_sys
t = 9.1935, df = 4500, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.1069914 0.1643398
sample estimates:
cor
0.1357793
summary(lm(rr_sys~bmi, data = dat))
Call:
lm(formula = rr_sys ~ bmi, data = dat)
Residuals:
Min 1Q Median 3Q Max
-40.973 -12.333 -2.909 8.812 111.049
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 112.64581 1.15331 97.672 <2e-16 ***
bmi 0.36093 0.03926 9.193 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 18.04 on 4500 degrees of freedom
(498 observations deleted due to missingness)
Multiple R-squared: 0.01844, Adjusted R-squared: 0.01822
F-statistic: 84.52 on 1 and 4500 DF, p-value: < 2.2e-16
Task 1.2
Does the relationship between systolic blood pressure and BMI change when you adjust for age (categorized)? Interpret the coefficients of the resulting model (when you mean-center BMI before fitting the model, you can also interpret the intercept).
dat$age_cat <-cut(dat$age, c(0,50,100), right = F) # cut the age in two groupssummary(lm(rr_sys ~ bmi + age_cat, data = dat))
Call:
lm(formula = rr_sys ~ bmi + age_cat, data = dat)
Residuals:
Min 1Q Median 3Q Max
-48.551 -10.478 -1.665 8.369 103.150
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 107.41715 1.06609 100.758 <2e-16 ***
bmi 0.30380 0.03586 8.472 <2e-16 ***
age_cat[50,100) 14.86352 0.49262 30.172 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 16.45 on 4499 degrees of freedom
(498 observations deleted due to missingness)
Multiple R-squared: 0.1836, Adjusted R-squared: 0.1833
F-statistic: 506 on 2 and 4499 DF, p-value: < 2.2e-16
Call:
lm(formula = rr_sys ~ bmi_c + age_cat, data = dat)
Residuals:
Min 1Q Median 3Q Max
-48.551 -10.478 -1.665 8.369 103.150
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 116.09959 0.33445 347.138 <2e-16 ***
bmi_c 0.30380 0.03586 8.472 <2e-16 ***
age_cat[50,100) 14.86352 0.49262 30.172 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 16.45 on 4499 degrees of freedom
(498 observations deleted due to missingness)
Multiple R-squared: 0.1836, Adjusted R-squared: 0.1833
F-statistic: 506 on 2 and 4499 DF, p-value: < 2.2e-16
Task 1.3
Try to find a better model to predict systolic blood pressure by including more covariates. Select a number of candidate covariates which in your opinion may be related to systolic blood pressure, and then choose a model selection strategy and a criterion/test for comparing models. Describe the model with the best fit according to your search, and interpret the model coefficients.
sub_dat <- dat[ ,c('rr_sys', 'bmi', 'male', 'age_cat', 'diab_lft', 'rdyfood_prvmo', 'alc_lft', 'smokstat')]complete.data <- sub_dat[complete.cases(sub_dat), ] # complete.cases deletes all rows with at least one NAcomplete.data$bmi <- complete.data$bmi -mean(complete.data$bmi)# full modelmod1 <-lm(rr_sys ~ bmi + male + age_cat + diab_lft + rdyfood_prvmo + alc_lft + smokstat, data=complete.data)summary(mod1)