Exercise 5: Solutions first part

Load the data in R.

# Read the NHANES extract; the CSV is expected in the working directory.
dat <- read.csv("NHANES1.csv")

# make factors: flag every integer-coded column ...
# vapply() always yields a logical vector (sapply() would return a list for a
# data frame without columns, which cannot be used as an index below).
integer_info <- vapply(dat, is.integer, logical(1))
# ... except age, which should stay an integer (logical mask: no-op if absent)
integer_info[names(integer_info) == "age"] <- FALSE
# NOTE(review): this also converts count-like integer columns to factors,
# e.g. rdyfood_prvmo enters the Task 1.3 model with one dummy per observed
# value -- confirm that this is intended.
dat[integer_info] <- lapply(dat[integer_info], as.factor)

Task 1.1

How strong is the relationship between BMI and systolic blood pressure?

# Five-number summary, mean and NA count of BMI
summary(dat[["bmi"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  13.40   23.80   27.40   28.58   32.00   80.60     290

# Five-number summary, mean and NA count of systolic blood pressure
summary(dat[["rr_sys"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  81.33  110.67  120.00  123.00  132.00  234.67     437

# Pearson correlation, using only rows where both variables are observed
cor(
  dat[["bmi"]],
  dat[["rr_sys"]],
  use = "complete.obs",
  method = "pearson"
)

[1] 0.1357793

# Two-sided test of zero Pearson correlation with a 95% confidence interval
# (defaults spelled out; argument expressions kept so the printed data line
# still reads "dat$bmi and dat$rr_sys")
cor.test(
  x = dat$bmi,
  y = dat$rr_sys,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)


    Pearson's product-moment correlation

data:  dat$bmi and dat$rr_sys
t = 9.1935, df = 4500, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1069914 0.1643398
sample estimates:
      cor 
0.1357793

# Simple linear regression of systolic blood pressure on BMI
fit_bmi <- lm(rr_sys ~ bmi, data = dat)
summary(fit_bmi)


Call:
lm(formula = rr_sys ~ bmi, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-40.973 -12.333  -2.909   8.812 111.049 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 112.64581    1.15331  97.672   <2e-16 ***
bmi           0.36093    0.03926   9.193   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 18.04 on 4500 degrees of freedom
  (498 observations deleted due to missingness)
Multiple R-squared:  0.01844,   Adjusted R-squared:  0.01822 
F-statistic: 84.52 on 1 and 4500 DF,  p-value: < 2.2e-16

Task 1.2

Does the relationship between systolic blood pressure and BMI change when you adjust for age (categorized)? Interpret the coefficients of the resulting model. (If you mean-center BMI before fitting the model, the intercept also becomes interpretable.)

# Cut the age in two groups: [0, 50) and [50, 100).
# right = FALSE closes the intervals on the left, so age 50 belongs to the
# older group. Spelled out as FALSE because `F` is an ordinary variable that
# can be reassigned.
# NOTE(review): an age of exactly 100 would fall outside both intervals and
# become NA -- confirm the data contain no such value.
dat$age_cat <- cut(dat$age, breaks = c(0, 50, 100), right = FALSE)

# BMI effect adjusted for the age group
fit_bmi_age <- lm(rr_sys ~ bmi + age_cat, data = dat)
summary(fit_bmi_age)


Call:
lm(formula = rr_sys ~ bmi + age_cat, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-48.551 -10.478  -1.665   8.369 103.150 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     107.41715    1.06609 100.758   <2e-16 ***
bmi               0.30380    0.03586   8.472   <2e-16 ***
age_cat[50,100)  14.86352    0.49262  30.172   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 16.45 on 4499 degrees of freedom
  (498 observations deleted due to missingness)
Multiple R-squared:  0.1836,    Adjusted R-squared:  0.1833 
F-statistic:   506 on 2 and 4499 DF,  p-value: < 2.2e-16

# Centre BMI on its mean over all non-missing BMI values, so that the
# intercept is the expected rr_sys at average BMI in the reference age group.
# na.rm = TRUE is spelled out because `T` is an ordinary, reassignable variable.
dat$bmi_c <- dat$bmi - mean(dat$bmi, na.rm = TRUE)
fit_bmi_c_age <- lm(rr_sys ~ bmi_c + age_cat, data = dat)
summary(fit_bmi_c_age)


Call:
lm(formula = rr_sys ~ bmi_c + age_cat, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-48.551 -10.478  -1.665   8.369 103.150 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     116.09959    0.33445 347.138   <2e-16 ***
bmi_c             0.30380    0.03586   8.472   <2e-16 ***
age_cat[50,100)  14.86352    0.49262  30.172   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 16.45 on 4499 degrees of freedom
  (498 observations deleted due to missingness)
Multiple R-squared:  0.1836,    Adjusted R-squared:  0.1833 
F-statistic:   506 on 2 and 4499 DF,  p-value: < 2.2e-16

Task 1.3

Try to find a better model to predict systolic blood pressure by including more covariates. Select a number of candidate covariates which in your opinion may be related to systolic blood pressure, and then choose a model selection strategy and a criterion/test for comparing models. Describe the model with the best fit according to your search, and interpret the model coefficients.

# Outcome plus the candidate covariates considered in the model search
model_vars <- c(
  "rr_sys", "bmi", "male", "age_cat",
  "diab_lft", "rdyfood_prvmo", "alc_lft", "smokstat"
)
sub_dat <- dat[, model_vars]

# Keep only rows without any NA, so that every candidate model is fitted to
# the same observations
keep_rows <- complete.cases(sub_dat)
complete.data <- sub_dat[keep_rows, ]

# Mean-centre BMI within the complete-case sample
complete.data$bmi <- complete.data$bmi - mean(complete.data$bmi)

# full model
mod1 <- lm(
  rr_sys ~ bmi + male + age_cat + diab_lft + rdyfood_prvmo +
    alc_lft + smokstat,
  data = complete.data
)
summary(mod1)


Call:
lm(formula = rr_sys ~ bmi + male + age_cat + diab_lft + rdyfood_prvmo + 
    alc_lft + smokstat, data = complete.data)

Residuals:
    Min      1Q  Median      3Q     Max 
-55.566 -10.612  -1.335   8.169  88.949 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     117.45866    2.28201  51.471  < 2e-16 ***
bmi               0.26921    0.07083   3.801  0.00015 ***
maleTRUE          3.89635    0.95159   4.095 4.46e-05 ***
age_cat[50,100)  12.73076    0.96987  13.126  < 2e-16 ***
diab_lft2        14.75862    6.99873   2.109  0.03514 *  
diab_lft3         2.35321    1.32221   1.780  0.07533 .  
rdyfood_prvmo1   -4.15966    1.66104  -2.504  0.01238 *  
rdyfood_prvmo2   -2.61710    1.95104  -1.341  0.18001    
rdyfood_prvmo3   -0.99637    2.66660  -0.374  0.70872    
rdyfood_prvmo4    2.11493    2.32854   0.908  0.36389    
rdyfood_prvmo5    2.68267    3.47250   0.773  0.43992    
rdyfood_prvmo6    4.60867    6.05242   0.761  0.44651    
rdyfood_prvmo7    7.38264    7.64957   0.965  0.33465    
rdyfood_prvmo8    3.08038    3.94554   0.781  0.43509    
rdyfood_prvmo10  -2.44552    3.67957  -0.665  0.50640    
rdyfood_prvmo12   0.49302    5.41527   0.091  0.92747    
rdyfood_prvmo14 -15.55326   12.08377  -1.287  0.19826    
rdyfood_prvmo15  -7.71514    4.58557  -1.682  0.09269 .  
rdyfood_prvmo16  20.99378   17.08501   1.229  0.21935    
rdyfood_prvmo17 -19.99846    7.01877  -2.849  0.00444 ** 
rdyfood_prvmo18  -3.80405   12.08323  -0.315  0.75294    
rdyfood_prvmo20   4.13037    5.71473   0.723  0.46994    
rdyfood_prvmo21  -2.47420    7.66179  -0.323  0.74680    
rdyfood_prvmo25 -10.56513   12.06767  -0.875  0.38145    
rdyfood_prvmo28  -7.21215   17.05012  -0.423  0.67236    
rdyfood_prvmo29  -7.39865   17.37706  -0.426  0.67034    
rdyfood_prvmo30  -1.92757    4.76932  -0.404  0.68615    
rdyfood_prvmo60 -21.08376   17.05149  -1.236  0.21648    
rdyfood_prvmo90  13.51854   12.05563   1.121  0.26233    
alc_lft2          0.01159    3.96368   0.003  0.99767    
alc_lft3         -2.11092    2.17485  -0.971  0.33191    
smokstat2         0.96205    1.68542   0.571  0.56822    
smokstat3         1.16030    0.97446   1.191  0.23396    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 17.02 on 1449 degrees of freedom
Multiple R-squared:  0.1669,    Adjusted R-squared:  0.1485 
F-statistic: 9.073 on 32 and 1449 DF,  p-value: < 2.2e-16

# stepwise selection: backward elimination by AIC, starting from the full
# model (with no scope given, stepAIC() only considers dropping terms)
library(MASS)
mod2 <- stepAIC(mod1, direction = "backward")

Start:  AIC=8433.67
rr_sys ~ bmi + male + age_cat + diab_lft + rdyfood_prvmo + alc_lft + 
    smokstat

                Df Sum of Sq    RSS    AIC
- rdyfood_prvmo 23      8872 428594 8418.7
- alc_lft        2       373 420095 8431.0
- smokstat       2       429 420151 8431.2
<none>                       419722 8433.7
- diab_lft       2      2109 421832 8437.1
- bmi            1      4184 423907 8446.4
- male           1      4856 424579 8448.7
- age_cat        1     49908 469631 8598.2

Step:  AIC=8418.67
rr_sys ~ bmi + male + age_cat + diab_lft + alc_lft + smokstat

           Df Sum of Sq    RSS    AIC
- alc_lft   2       340 428934 8415.8
- smokstat  2       361 428955 8415.9
<none>                  428594 8418.7
- diab_lft  2      2321 430915 8422.7
- bmi       1      3451 432045 8428.6
- male      1      6072 434666 8437.5
- age_cat   1     51055 479649 8583.5

Step:  AIC=8415.84
rr_sys ~ bmi + male + age_cat + diab_lft + smokstat

           Df Sum of Sq    RSS    AIC
- smokstat  2       340 429274 8413.0
<none>                  428934 8415.8
- diab_lft  2      2337 431271 8419.9
- bmi       1      3490 432424 8425.9
- male      1      5799 434733 8433.7
- age_cat   1     51243 480177 8581.1

Step:  AIC=8413.02
rr_sys ~ bmi + male + age_cat + diab_lft

           Df Sum of Sq    RSS    AIC
<none>                  429274 8413.0
- diab_lft  2      2376 431650 8417.2
- bmi       1      3294 432568 8422.3
- male      1      5765 435039 8430.8
- age_cat   1     52724 481997 8582.7

# Coefficient table and fit statistics of the AIC-selected model
selected_fit <- summary(mod2)
selected_fit


Call:
lm(formula = rr_sys ~ bmi + male + age_cat + diab_lft, data = complete.data)

Residuals:
    Min      1Q  Median      3Q     Max 
-54.225 -10.725  -1.598   8.447  92.175 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     115.46197    0.88013 131.187  < 2e-16 ***
bmi               0.23533    0.06992   3.365 0.000784 ***
maleTRUE          4.14117    0.93013   4.452 9.14e-06 ***
age_cat[50,100)  12.43849    0.92382  13.464  < 2e-16 ***
diab_lft2        14.47054    6.99737   2.068 0.038814 *  
diab_lft3         2.72325    1.31599   2.069 0.038686 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 17.05 on 1476 degrees of freedom
Multiple R-squared:  0.148, Adjusted R-squared:  0.1451 
F-statistic: 51.26 on 5 and 1476 DF,  p-value: < 2.2e-16

# Cross-check of the search path: refit the model from the second-to-last
# step (smokstat still included) and report its AIC.
# extractAIC() returns c(equivalent degrees of freedom, AIC).
mod3 <- lm(
  rr_sys ~ bmi + male + age_cat + diab_lft + smokstat,
  data = complete.data
)
extractAIC(mod3)

[1]    8.000 8415.844