Exercise 5: Solutions second part

Load the data in R.

data <- read.csv("NHANES1.csv")

# make factors
integer_info <- sapply(data, is.integer)
integer_info[which(names(integer_info) == "age")] <- FALSE  # age should stay an integer
data[integer_info] <- lapply(data[integer_info], as.factor)

# make age category
age.cat <- rep(0, nrow(data))  # assign new variable to the dataframe
age.cat[data$age>=18 & data$age<=34] <- 1  # fill it with values
age.cat[data$age>=35 & data$age<=49] <- 2
age.cat[data$age>=50 & data$age<=64] <- 3
age.cat[data$age>=65 & data$age<=79] <- 4
age.cat[data$age>=80] <- 5
age.cat <- as.factor(age.cat)
data$age.cat <- age.cat
# alternatively use cut() as before. something like age.cat <- cut(data$age, c(0, 18, 35, 50, 65, 80, 100))

Task 2.1

Analyze the relationship between lifetime diagnosis of cancer and exposure to pollutants, using the categorized age variable (Note: No information on pollutant exposure was collected from participants aged 80+, so these cannot be included in the analysis). Does the adjustment for age change the picture? Interpret the model coefficients including the intercept.

# attach the data to use them directly without data$...
# however, use this with caution!
attach(data)

The following object is masked _by_ .GlobalEnv:

    age.cat

table(workpollut,cancer_ever)

          cancer_ever
workpollut FALSE TRUE
     FALSE  1899  140
     TRUE   1991  163

prop.test(table(!workpollut,!cancer_ever))


    2-sample test for equality of proportions with continuity correction

data:  table(!workpollut, !cancer_ever)
X-squared = 0.66719, df = 1, p-value = 0.414
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.009124746  0.023148862
sample estimates:
    prop 1     prop 2 
0.07567317 0.06866111

table(workpollut,cancer_ever,age.cat)

, , age.cat = 1

          cancer_ever
workpollut FALSE TRUE
     FALSE   574    5
     TRUE    587    8

, , age.cat = 2

          cancer_ever
workpollut FALSE TRUE
     FALSE   519   21
     TRUE    548   19

, , age.cat = 3

          cancer_ever
workpollut FALSE TRUE
     FALSE   522   48
     TRUE    569   61

, , age.cat = 4

          cancer_ever
workpollut FALSE TRUE
     FALSE   284   66
     TRUE    287   75

, , age.cat = 5

          cancer_ever
workpollut FALSE TRUE
     FALSE     0    0
     TRUE      0    0

prop.test(table(!workpollut[age.cat==1],!cancer_ever[age.cat==1]))


    2-sample test for equality of proportions with continuity correction

data:  table(!workpollut[age.cat == 1], !cancer_ever[age.cat == 1])
X-squared = 0.2585, df = 1, p-value = 0.6112
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.008828858  0.018448457
sample estimates:
     prop 1      prop 2 
0.013445378 0.008635579

prop.test(table(!workpollut[age.cat==2],!cancer_ever[age.cat==2]))


    2-sample test for equality of proportions with continuity correction

data:  table(!workpollut[age.cat == 2], !cancer_ever[age.cat == 2])
X-squared = 0.10129, df = 1, p-value = 0.7503
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.02921675  0.01845837
sample estimates:
    prop 1     prop 2 
0.03350970 0.03888889

prop.test(table(!workpollut[age.cat==3],!cancer_ever[age.cat==3]))


    2-sample test for equality of proportions with continuity correction

data:  table(!workpollut[age.cat == 3], !cancer_ever[age.cat == 3])
X-squared = 0.43401, df = 1, p-value = 0.51
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.02150546  0.04673520
sample estimates:
    prop 1     prop 2 
0.09682540 0.08421053

prop.test(table(!workpollut[age.cat==4],!cancer_ever[age.cat==4]))


    2-sample test for equality of proportions with continuity correction

data:  table(!workpollut[age.cat == 4], !cancer_ever[age.cat == 4])
X-squared = 0.27975, df = 1, p-value = 0.5969
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.04270074  0.07992252
sample estimates:
   prop 1    prop 2 
0.2071823 0.1885714

fisher.test(table(!workpollut,!cancer_ever))


    Fisher's Exact Test for Count Data

data:  table(!workpollut, !cancer_ever)
p-value = 0.4037
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.872556 1.414624
sample estimates:
odds ratio 
  1.110457

fisher.test(table(!workpollut[age.cat==1],!cancer_ever[age.cat==1]))


    Fisher's Exact Test for Count Data

data:  table(!workpollut[age.cat == 1], !cancer_ever[age.cat == 1])
p-value = 0.5794
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.4480324 6.1135675
sample estimates:
odds ratio 
  1.563972

fisher.test(table(!workpollut[age.cat==2],!cancer_ever[age.cat==2]))


    Fisher's Exact Test for Count Data

data:  table(!workpollut[age.cat == 2], !cancer_ever[age.cat == 2])
p-value = 0.7477
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.4303852 1.6961783
sample estimates:
odds ratio 
 0.8570029

fisher.test(table(!workpollut[age.cat==3],!cancer_ever[age.cat==3]))


    Fisher's Exact Test for Count Data

data:  table(!workpollut[age.cat == 3], !cancer_ever[age.cat == 3])
p-value = 0.4821
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.7701114 1.7727133
sample estimates:
odds ratio 
  1.165712

fisher.test(table(!workpollut[age.cat==4],!cancer_ever[age.cat==4]))


    Fisher's Exact Test for Count Data

data:  table(!workpollut[age.cat == 4], !cancer_ever[age.cat == 4])
p-value = 0.5729
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.7643783 1.6563678
sample estimates:
odds ratio 
  1.124299

mod1 <- glm(cancer_ever ~ workpollut, family = binomial(logit))
summary(mod1)


Call:
glm(formula = cancer_ever ~ workpollut, family = binomial(logit))

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.60744    0.08758 -29.774   <2e-16 ***
workpollutTRUE  0.10480    0.11961   0.876    0.381    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2175.8  on 4192  degrees of freedom
Residual deviance: 2175.0  on 4191  degrees of freedom
  (807 observations deleted due to missingness)
AIC: 2179

Number of Fisher Scoring iterations: 5

mod2 <- glm(cancer_ever ~ workpollut + age.cat, family = binomial(logit))
summary(mod2)


Call:
glm(formula = cancer_ever ~ workpollut + age.cat, family = binomial(logit))

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -4.5476     0.2866 -15.869  < 2e-16 ***
workpollutTRUE   0.1067     0.1235   0.864 0.387689    
age.cat2         1.2079     0.3221   3.750 0.000177 ***
age.cat3         2.1869     0.2964   7.377 1.62e-13 ***
age.cat4         3.0938     0.2943  10.511  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2175.8  on 4192  degrees of freedom
Residual deviance: 1925.8  on 4188  degrees of freedom
  (807 observations deleted due to missingness)
AIC: 1935.8

Number of Fisher Scoring iterations: 7

Task 2.2

Try to find a good model of cancer diagnosis, describe and interpret it as you did for systolic blood pressure.

sub.data <- data[ ,c('cancer_ever', 'cd', 'pb', 'hg', 'drnkprd_prv12mo', 'cigsprd_prv30d', 'bmi', 'male', 'age.cat', 'diab_lft', 'rdyfood_prvmo')]
complete.data <- sub.data[complete.cases(sub.data), ]

mod3 <- glm(cancer_ever ~ cd + pb + hg + drnkprd_prv12mo + cigsprd_prv30d + bmi + male + age.cat + as.factor(diab_lft) + rdyfood_prvmo, family = binomial(logit), data = complete.data)

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

library(MASS)
mod4 <- stepAIC(mod3)

Start:  AIC=283.76
cancer_ever ~ cd + pb + hg + drnkprd_prv12mo + cigsprd_prv30d + 
    bmi + male + age.cat + as.factor(diab_lft) + rdyfood_prvmo

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

                      Df Deviance    AIC
- cigsprd_prv30d      25   160.39 250.39
- rdyfood_prvmo       16   152.20 260.20
- drnkprd_prv12mo     16   153.02 261.02
- as.factor(diab_lft)  2   143.93 279.93
- hg                   1   143.98 281.98
- cd                   1   144.17 282.17
- pb                   1   144.44 282.44
- bmi                  1   145.10 283.10
<none>                     143.76 283.76
- male                 1   146.11 284.11
- age.cat              4   165.18 297.18

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred


Step:  AIC=250.39
cancer_ever ~ cd + pb + hg + drnkprd_prv12mo + bmi + male + age.cat + 
    as.factor(diab_lft) + rdyfood_prvmo

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

                      Df Deviance    AIC
- drnkprd_prv12mo     16   170.95 228.95
- rdyfood_prvmo       16   171.31 229.31
- as.factor(diab_lft)  2   161.00 247.00
- cd                   1   160.39 248.39
- hg                   1   160.60 248.60
- bmi                  1   161.07 249.07
- pb                   1   161.82 249.82
<none>                     160.39 250.39
- male                 1   163.55 251.55
- age.cat              4   186.60 268.60

Step:  AIC=228.95
cancer_ever ~ cd + pb + hg + bmi + male + age.cat + as.factor(diab_lft) + 
    rdyfood_prvmo

                      Df Deviance    AIC
- rdyfood_prvmo       17   183.10 207.10
- as.factor(diab_lft)  2   171.58 225.58
- cd                   1   170.95 226.95
- hg                   1   170.97 226.97
- bmi                  1   171.61 227.61
- pb                   1   172.33 228.33
<none>                     170.95 228.95
- male                 1   173.70 229.70
- age.cat              4   198.16 248.16

Step:  AIC=207.1
cancer_ever ~ cd + pb + hg + bmi + male + age.cat + as.factor(diab_lft)

                      Df Deviance    AIC
- as.factor(diab_lft)  2   183.80 203.80
- cd                   1   183.11 205.11
- hg                   1   183.12 205.12
- bmi                  1   183.50 205.50
- pb                   1   183.89 205.89
<none>                     183.10 207.10
- male                 1   185.12 207.12
- age.cat              4   209.10 225.10

Step:  AIC=203.8
cancer_ever ~ cd + pb + hg + bmi + male + age.cat

          Df Deviance    AIC
- cd       1   183.81 201.81
- hg       1   183.82 201.82
- bmi      1   184.19 202.19
- pb       1   184.69 202.69
- male     1   185.62 203.62
<none>         183.80 203.80
- age.cat  4   212.42 224.42

Step:  AIC=201.81
cancer_ever ~ pb + hg + bmi + male + age.cat

          Df Deviance    AIC
- hg       1   183.83 199.83
- bmi      1   184.19 200.19
- pb       1   184.73 200.73
- male     1   185.63 201.63
<none>         183.81 201.81
- age.cat  4   212.45 222.45

Step:  AIC=199.83
cancer_ever ~ pb + bmi + male + age.cat

          Df Deviance    AIC
- bmi      1   184.21 198.21
- pb       1   184.73 198.73
- male     1   185.63 199.63
<none>         183.83 199.83
- age.cat  4   212.65 220.65

Step:  AIC=198.21
cancer_ever ~ pb + male + age.cat

          Df Deviance    AIC
- pb       1   184.96 196.96
- male     1   186.00 198.00
<none>         184.21 198.21
- age.cat  4   214.06 220.06

Step:  AIC=196.96
cancer_ever ~ male + age.cat

          Df Deviance    AIC
<none>         184.96 196.96
- male     1   187.17 197.17
- age.cat  4   214.34 218.34

summary(mod4)


Call:
glm(formula = cancer_ever ~ male + age.cat, family = binomial(logit), 
    data = complete.data)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -4.2230     0.7419  -5.692 1.25e-08 ***
maleTRUE     -0.6124     0.4088  -1.498 0.134107    
age.cat2      0.2197     1.0064   0.218 0.827192    
age.cat3      2.4914     0.7565   3.293 0.000991 ***
age.cat4      2.6475     0.8356   3.168 0.001534 ** 
age.cat5    -10.7306   840.2745  -0.013 0.989811    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 216.41  on 559  degrees of freedom
Residual deviance: 184.96  on 554  degrees of freedom
AIC: 196.96

Number of Fisher Scoring iterations: 14

# explanation
# only intercept
mod5 <- glm(cancer_ever ~ 1, data = data, family=binomial)
summary(mod5)


Call:
glm(formula = cancer_ever ~ 1, family = binomial, data = data)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.33940    0.05134  -45.57   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2817.2  on 4731  degrees of freedom
Residual deviance: 2817.2  on 4731  degrees of freedom
  (268 observations deleted due to missingness)
AIC: 2819.2

Number of Fisher Scoring iterations: 5

(tab <- table(data$cancer_ever))


FALSE  TRUE 
 4316   416

tab["TRUE"]/(tab["TRUE"]+tab["FALSE"])

      TRUE 
0.08791209

exp(mod5$coefficients["(Intercept)"])/(1+exp(mod5$coefficients["(Intercept)"]))

(Intercept) 
 0.08791209

# with a predictor
mod6 <- glm(cancer_ever~workpollut, data = data, family = binomial(logit))
summary(mod6)


Call:
glm(formula = cancer_ever ~ workpollut, family = binomial(logit), 
    data = data)

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.60744    0.08758 -29.774   <2e-16 ***
workpollutTRUE  0.10480    0.11961   0.876    0.381    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2175.8  on 4192  degrees of freedom
Residual deviance: 2175.0  on 4191  degrees of freedom
  (807 observations deleted due to missingness)
AIC: 2179

Number of Fisher Scoring iterations: 5

(tab<-table(data$workpollut, data$cancer_ever))

       
        FALSE TRUE
  FALSE  1899  140
  TRUE   1991  163

#For people not exposed to work pollution
p.ne <- tab["FALSE","TRUE"]/(tab["FALSE","TRUE"]+tab["FALSE","FALSE"])
exp(mod6$coefficients["(Intercept)"])/(1+exp(mod6$coefficients["(Intercept)"]))

(Intercept) 
 0.06866111

#For people exposed to work pollution
p.e <- tab["TRUE","TRUE"]/(tab["TRUE","TRUE"]+tab["TRUE","FALSE"])
exp(mod6$coefficients["(Intercept)"]+mod6$coefficients["workpollutTRUE"])/(1+exp(mod6$coefficients["(Intercept)"]+mod6$coefficients["workpollutTRUE"]))

(Intercept) 
 0.07567317

# log-odd baseline (no workpollution)
log(p.ne/(1-p.ne))

[1] -2.60744

mod6$coefficients["(Intercept)"]

(Intercept) 
   -2.60744

# log-odds workpollution
log(p.e/(1-p.e))

[1] -2.502642

mod6$coefficients["(Intercept)"]+mod6$coefficients["workpollutTRUE"]

(Intercept) 
  -2.502642

# log odds-ratio
log((p.e/(1-p.e))/(p.ne/(1-p.ne)))

[1] 0.1047982

summary(mod6)


Call:
glm(formula = cancer_ever ~ workpollut, family = binomial(logit), 
    data = data)

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -2.60744    0.08758 -29.774   <2e-16 ***
workpollutTRUE  0.10480    0.11961   0.876    0.381    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2175.8  on 4192  degrees of freedom
Residual deviance: 2175.0  on 4191  degrees of freedom
  (807 observations deleted due to missingness)
AIC: 2179

Number of Fisher Scoring iterations: 5