data <- read.csv("NHANES1.csv")
# make factors
integer_info <- sapply(data, is.integer)
integer_info[which(names(integer_info) == "age")] <- FALSE # age should stay an integer
data[integer_info] <- lapply(data[integer_info], as.factor)
# make age category
age.cat <- rep(0, nrow(data)) # assign new variable to the dataframe
age.cat[data$age>=18 & data$age<=34] <- 1 # fill it with values
age.cat[data$age>=35 & data$age<=49] <- 2
age.cat[data$age>=50 & data$age<=64] <- 3
age.cat[data$age>=65 & data$age<=79] <- 4
age.cat[data$age>=80] <- 5
age.cat <- as.factor(age.cat)
data$age.cat <- age.cat
# alternatively use cut() as before. something like age.cat <- cut(data$age, c(0, 18, 35, 50, 65, 80, 100))Exercise 5: Solutions second part
Load the data in R.
Task 2.1
Analyze the relationship between lifetime diagnosis of cancer and exposure to pollutants, using the categorized age variable (Note: No information on pollutant exposure was collected from participants aged 80+, so these cannot be included in the analysis). Does the adjustment for age change the picture? Interpret the model coefficients including the intercept.
# attach the data to use them directly without data$...
# however, use this with caution!
attach(data)The following object is masked _by_ .GlobalEnv:
age.cat
table(workpollut,cancer_ever) cancer_ever
workpollut FALSE TRUE
FALSE 1899 140
TRUE 1991 163
prop.test(table(!workpollut,!cancer_ever))
2-sample test for equality of proportions with continuity correction
data: table(!workpollut, !cancer_ever)
X-squared = 0.66719, df = 1, p-value = 0.414
alternative hypothesis: two.sided
95 percent confidence interval:
-0.009124746 0.023148862
sample estimates:
prop 1 prop 2
0.07567317 0.06866111
table(workpollut,cancer_ever,age.cat), , age.cat = 1
cancer_ever
workpollut FALSE TRUE
FALSE 574 5
TRUE 587 8
, , age.cat = 2
cancer_ever
workpollut FALSE TRUE
FALSE 519 21
TRUE 548 19
, , age.cat = 3
cancer_ever
workpollut FALSE TRUE
FALSE 522 48
TRUE 569 61
, , age.cat = 4
cancer_ever
workpollut FALSE TRUE
FALSE 284 66
TRUE 287 75
, , age.cat = 5
cancer_ever
workpollut FALSE TRUE
FALSE 0 0
TRUE 0 0
prop.test(table(!workpollut[age.cat==1],!cancer_ever[age.cat==1]))
2-sample test for equality of proportions with continuity correction
data: table(!workpollut[age.cat == 1], !cancer_ever[age.cat == 1])
X-squared = 0.2585, df = 1, p-value = 0.6112
alternative hypothesis: two.sided
95 percent confidence interval:
-0.008828858 0.018448457
sample estimates:
prop 1 prop 2
0.013445378 0.008635579
prop.test(table(!workpollut[age.cat==2],!cancer_ever[age.cat==2]))
2-sample test for equality of proportions with continuity correction
data: table(!workpollut[age.cat == 2], !cancer_ever[age.cat == 2])
X-squared = 0.10129, df = 1, p-value = 0.7503
alternative hypothesis: two.sided
95 percent confidence interval:
-0.02921675 0.01845837
sample estimates:
prop 1 prop 2
0.03350970 0.03888889
prop.test(table(!workpollut[age.cat==3],!cancer_ever[age.cat==3]))
2-sample test for equality of proportions with continuity correction
data: table(!workpollut[age.cat == 3], !cancer_ever[age.cat == 3])
X-squared = 0.43401, df = 1, p-value = 0.51
alternative hypothesis: two.sided
95 percent confidence interval:
-0.02150546 0.04673520
sample estimates:
prop 1 prop 2
0.09682540 0.08421053
prop.test(table(!workpollut[age.cat==4],!cancer_ever[age.cat==4]))
2-sample test for equality of proportions with continuity correction
data: table(!workpollut[age.cat == 4], !cancer_ever[age.cat == 4])
X-squared = 0.27975, df = 1, p-value = 0.5969
alternative hypothesis: two.sided
95 percent confidence interval:
-0.04270074 0.07992252
sample estimates:
prop 1 prop 2
0.2071823 0.1885714
fisher.test(table(!workpollut,!cancer_ever))
Fisher's Exact Test for Count Data
data: table(!workpollut, !cancer_ever)
p-value = 0.4037
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.872556 1.414624
sample estimates:
odds ratio
1.110457
fisher.test(table(!workpollut[age.cat==1],!cancer_ever[age.cat==1]))
Fisher's Exact Test for Count Data
data: table(!workpollut[age.cat == 1], !cancer_ever[age.cat == 1])
p-value = 0.5794
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.4480324 6.1135675
sample estimates:
odds ratio
1.563972
fisher.test(table(!workpollut[age.cat==2],!cancer_ever[age.cat==2]))
Fisher's Exact Test for Count Data
data: table(!workpollut[age.cat == 2], !cancer_ever[age.cat == 2])
p-value = 0.7477
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.4303852 1.6961783
sample estimates:
odds ratio
0.8570029
fisher.test(table(!workpollut[age.cat==3],!cancer_ever[age.cat==3]))
Fisher's Exact Test for Count Data
data: table(!workpollut[age.cat == 3], !cancer_ever[age.cat == 3])
p-value = 0.4821
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.7701114 1.7727133
sample estimates:
odds ratio
1.165712
fisher.test(table(!workpollut[age.cat==4],!cancer_ever[age.cat==4]))
Fisher's Exact Test for Count Data
data: table(!workpollut[age.cat == 4], !cancer_ever[age.cat == 4])
p-value = 0.5729
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.7643783 1.6563678
sample estimates:
odds ratio
1.124299
mod1 <- glm(cancer_ever ~ workpollut, family = binomial(logit))
summary(mod1)
Call:
glm(formula = cancer_ever ~ workpollut, family = binomial(logit))
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.60744 0.08758 -29.774 <2e-16 ***
workpollutTRUE 0.10480 0.11961 0.876 0.381
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2175.8 on 4192 degrees of freedom
Residual deviance: 2175.0 on 4191 degrees of freedom
(807 observations deleted due to missingness)
AIC: 2179
Number of Fisher Scoring iterations: 5
mod2 <- glm(cancer_ever ~ workpollut + age.cat, family = binomial(logit))
summary(mod2)
Call:
glm(formula = cancer_ever ~ workpollut + age.cat, family = binomial(logit))
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.5476 0.2866 -15.869 < 2e-16 ***
workpollutTRUE 0.1067 0.1235 0.864 0.387689
age.cat2 1.2079 0.3221 3.750 0.000177 ***
age.cat3 2.1869 0.2964 7.377 1.62e-13 ***
age.cat4 3.0938 0.2943 10.511 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2175.8 on 4192 degrees of freedom
Residual deviance: 1925.8 on 4188 degrees of freedom
(807 observations deleted due to missingness)
AIC: 1935.8
Number of Fisher Scoring iterations: 7
Task 2.2
Try to find a good model of cancer diagnosis, describe and interpret it as you did for systolic blood pressure.
sub.data <- data[ ,c('cancer_ever', 'cd', 'pb', 'hg', 'drnkprd_prv12mo', 'cigsprd_prv30d', 'bmi', 'male', 'age.cat', 'diab_lft', 'rdyfood_prvmo')]
complete.data <- sub.data[complete.cases(sub.data), ]
mod3 <- glm(cancer_ever ~ cd + pb + hg + drnkprd_prv12mo + cigsprd_prv30d + bmi + male + age.cat + as.factor(diab_lft) + rdyfood_prvmo, family = binomial(logit), data = complete.data)Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
library(MASS)
mod4 <- stepAIC(mod3)Start: AIC=283.76
cancer_ever ~ cd + pb + hg + drnkprd_prv12mo + cigsprd_prv30d +
bmi + male + age.cat + as.factor(diab_lft) + rdyfood_prvmo
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- cigsprd_prv30d 25 160.39 250.39
- rdyfood_prvmo 16 152.20 260.20
- drnkprd_prv12mo 16 153.02 261.02
- as.factor(diab_lft) 2 143.93 279.93
- hg 1 143.98 281.98
- cd 1 144.17 282.17
- pb 1 144.44 282.44
- bmi 1 145.10 283.10
<none> 143.76 283.76
- male 1 146.11 284.11
- age.cat 4 165.18 297.18
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=250.39
cancer_ever ~ cd + pb + hg + drnkprd_prv12mo + bmi + male + age.cat +
as.factor(diab_lft) + rdyfood_prvmo
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- drnkprd_prv12mo 16 170.95 228.95
- rdyfood_prvmo 16 171.31 229.31
- as.factor(diab_lft) 2 161.00 247.00
- cd 1 160.39 248.39
- hg 1 160.60 248.60
- bmi 1 161.07 249.07
- pb 1 161.82 249.82
<none> 160.39 250.39
- male 1 163.55 251.55
- age.cat 4 186.60 268.60
Step: AIC=228.95
cancer_ever ~ cd + pb + hg + bmi + male + age.cat + as.factor(diab_lft) +
rdyfood_prvmo
Df Deviance AIC
- rdyfood_prvmo 17 183.10 207.10
- as.factor(diab_lft) 2 171.58 225.58
- cd 1 170.95 226.95
- hg 1 170.97 226.97
- bmi 1 171.61 227.61
- pb 1 172.33 228.33
<none> 170.95 228.95
- male 1 173.70 229.70
- age.cat 4 198.16 248.16
Step: AIC=207.1
cancer_ever ~ cd + pb + hg + bmi + male + age.cat + as.factor(diab_lft)
Df Deviance AIC
- as.factor(diab_lft) 2 183.80 203.80
- cd 1 183.11 205.11
- hg 1 183.12 205.12
- bmi 1 183.50 205.50
- pb 1 183.89 205.89
<none> 183.10 207.10
- male 1 185.12 207.12
- age.cat 4 209.10 225.10
Step: AIC=203.8
cancer_ever ~ cd + pb + hg + bmi + male + age.cat
Df Deviance AIC
- cd 1 183.81 201.81
- hg 1 183.82 201.82
- bmi 1 184.19 202.19
- pb 1 184.69 202.69
- male 1 185.62 203.62
<none> 183.80 203.80
- age.cat 4 212.42 224.42
Step: AIC=201.81
cancer_ever ~ pb + hg + bmi + male + age.cat
Df Deviance AIC
- hg 1 183.83 199.83
- bmi 1 184.19 200.19
- pb 1 184.73 200.73
- male 1 185.63 201.63
<none> 183.81 201.81
- age.cat 4 212.45 222.45
Step: AIC=199.83
cancer_ever ~ pb + bmi + male + age.cat
Df Deviance AIC
- bmi 1 184.21 198.21
- pb 1 184.73 198.73
- male 1 185.63 199.63
<none> 183.83 199.83
- age.cat 4 212.65 220.65
Step: AIC=198.21
cancer_ever ~ pb + male + age.cat
Df Deviance AIC
- pb 1 184.96 196.96
- male 1 186.00 198.00
<none> 184.21 198.21
- age.cat 4 214.06 220.06
Step: AIC=196.96
cancer_ever ~ male + age.cat
Df Deviance AIC
<none> 184.96 196.96
- male 1 187.17 197.17
- age.cat 4 214.34 218.34
summary(mod4)
Call:
glm(formula = cancer_ever ~ male + age.cat, family = binomial(logit),
data = complete.data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.2230 0.7419 -5.692 1.25e-08 ***
maleTRUE -0.6124 0.4088 -1.498 0.134107
age.cat2 0.2197 1.0064 0.218 0.827192
age.cat3 2.4914 0.7565 3.293 0.000991 ***
age.cat4 2.6475 0.8356 3.168 0.001534 **
age.cat5 -10.7306 840.2745 -0.013 0.989811
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 216.41 on 559 degrees of freedom
Residual deviance: 184.96 on 554 degrees of freedom
AIC: 196.96
Number of Fisher Scoring iterations: 14
# explanation
# only intercept
mod5 <- glm(cancer_ever ~ 1, data = data, family=binomial)
summary(mod5)
Call:
glm(formula = cancer_ever ~ 1, family = binomial, data = data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.33940 0.05134 -45.57 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2817.2 on 4731 degrees of freedom
Residual deviance: 2817.2 on 4731 degrees of freedom
(268 observations deleted due to missingness)
AIC: 2819.2
Number of Fisher Scoring iterations: 5
(tab <- table(data$cancer_ever))
FALSE TRUE
4316 416
tab["TRUE"]/(tab["TRUE"]+tab["FALSE"]) TRUE
0.08791209
exp(mod5$coefficients["(Intercept)"])/(1+exp(mod5$coefficients["(Intercept)"]))(Intercept)
0.08791209
# with a predictor
mod6 <- glm(cancer_ever~workpollut, data = data, family = binomial(logit))
summary(mod6)
Call:
glm(formula = cancer_ever ~ workpollut, family = binomial(logit),
data = data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.60744 0.08758 -29.774 <2e-16 ***
workpollutTRUE 0.10480 0.11961 0.876 0.381
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2175.8 on 4192 degrees of freedom
Residual deviance: 2175.0 on 4191 degrees of freedom
(807 observations deleted due to missingness)
AIC: 2179
Number of Fisher Scoring iterations: 5
(tab<-table(data$workpollut, data$cancer_ever))
FALSE TRUE
FALSE 1899 140
TRUE 1991 163
#For people not exposed to work pollution
p.ne <- tab["FALSE","TRUE"]/(tab["FALSE","TRUE"]+tab["FALSE","FALSE"])
exp(mod6$coefficients["(Intercept)"])/(1+exp(mod6$coefficients["(Intercept)"]))(Intercept)
0.06866111
#For people exposed to work pollution
p.e <- tab["TRUE","TRUE"]/(tab["TRUE","TRUE"]+tab["TRUE","FALSE"])
exp(mod6$coefficients["(Intercept)"]+mod6$coefficients["workpollutTRUE"])/(1+exp(mod6$coefficients["(Intercept)"]+mod6$coefficients["workpollutTRUE"]))(Intercept)
0.07567317
# log-odd baseline (no workpollution)
log(p.ne/(1-p.ne))[1] -2.60744
mod6$coefficients["(Intercept)"](Intercept)
-2.60744
# log-odds workpollution
log(p.e/(1-p.e))[1] -2.502642
mod6$coefficients["(Intercept)"]+mod6$coefficients["workpollutTRUE"](Intercept)
-2.502642
# log odds-ratio
log((p.e/(1-p.e))/(p.ne/(1-p.ne)))[1] 0.1047982
summary(mod6)
Call:
glm(formula = cancer_ever ~ workpollut, family = binomial(logit),
data = data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.60744 0.08758 -29.774 <2e-16 ***
workpollutTRUE 0.10480 0.11961 0.876 0.381
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2175.8 on 4192 degrees of freedom
Residual deviance: 2175.0 on 4191 degrees of freedom
(807 observations deleted due to missingness)
AIC: 2179
Number of Fisher Scoring iterations: 5