library(ISwR)
# Epidemiological studies calculate rates: death rates or incidence rates of a
# chronic or acute disease, based on counting the events that occur in a given
# time frame. Poisson regression is used to analyze such rate data. Data that
# are amounts of time until an event can be analyzed by a similar technique.
# 15.1 Basic idea
# Rate data come in two forms: aggregate data, an observed count x based on a
# number of person-years T (often approximated from tables of population
# size), possibly grouped so that rates can be compared between groups; and
# individual-level data, in which each subject has a time under observation Ti
# and an indicator xi (0 or 1) of whether the subject had the event.
# Then x = sum(xi) and T = sum(Ti), with the sums over all individuals in the
# group.
# 15.1.1 The Poisson distribution
# The Poisson distribution can be described as the limiting case of the
# binomial distributions when the size parameter N increases while the
# expected number of successes lambda = N*p stays fixed. It is used to
# describe rare events in large populations. The point probabilities are
#   f(x) = lambda^x / x! * exp(-lambda),  x = 0, 1, ...
# The distribution is unbounded, but the probability of large x is low.
# dpois and ppois compute the point probabilities and cumulative distribution.
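# A quick illustration (hypothetical mean: lambda = 4 expected events):
print(dpois(0:6, lambda = 4))  # point probabilities P(X=0), ..., P(X=6)
print(ppois(6, lambda = 4))    # cumulative probability P(X <= 6)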
# In epidemiology the interesting quantity is the count x per observed time
# span T (the rate of events), which lets us compare groups of different sizes
# observed for different lengths of time. Parameterize the distribution with
# rho = lambda / T.
# Poisson likelihood: models are fitted by maximum likelihood. Parameterizing
# by rho, the log-likelihood becomes
#   l(rho) = constant + x*log(rho) - rho*T
# which is maximized when rho = x/T, the empirical rate.
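# Numerical check of the maximum (hypothetical data: x.obs = 20 events over
# T.obs = 500 person-years, so the empirical rate is 20/500 = 0.04):
x.obs <- 20; T.obs <- 500
loglik <- function(rho) x.obs*log(rho) - rho*T.obs  # constant term dropped
print(optimize(loglik, c(1e-6, 1), maximum = TRUE)$maximum)  # approx 0.04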
# 15.1.2 Survival analysis with constant hazard
# Stated here in mortality-study terms, but the event could be anything.
# Individual data are basically survival data as in ch. 14, with a change of
# notation. Rate analysis assumes that the hazard does not change over time,
# or at least not abruptly: rates are obtained over short time periods, and
# the origin of the timescale is usually not keyed to, say, disease onset or
# major surgery. With constant hazard rho, the distribution of the lifetime is
# exponential, with density rho*exp(-rho*t) and survival function exp(-rho*t).
# Likelihood analysis
# For censored data the likelihood is built from terms that are either the
# probability density at the time of death or the survival probability in case
# of censoring. With constant hazard the two kinds of terms differ only by a
# factor of rho, which can be encoded via xi, so each subject's log-likelihood
# term becomes
#   l(rho) = xi*log(rho) - rho*Ti
# where xi = 1 for a death and 0 for censoring. Except for the constant, this
# is the same as the Poisson log-likelihood; this trick allows survival data
# to be analyzed by Poisson regression.
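# A minimal simulated check of the trick (assumptions: true rate 0.05,
# follow-up censored at t = 10; not part of the original notes):
set.seed(1)
Ti <- pmin(rexp(100, rate = 0.05), 10)  # observation times, censored at 10
xi <- as.numeric(Ti < 10)               # 1 = death observed, 0 = censored
# An intercept-only Poisson fit with offset log(Ti) recovers sum(xi)/sum(Ti):
print(exp(coef(glm(xi ~ 1, family = poisson, offset = log(Ti)))))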
# This extends to hazards that are piecewise constant, or approximately so,
# over the observation period: break the lifetime into pieces Ti(1), Ti(2),
# ..., Ti(k) within which the hazard is constant. The log-likelihood term is
#   l(rho1, ..., rhok) = sum over j = 1..k of (xi(j)*log(rhoj) - rhoj*Ti(j))
# The first k-1 of the xi(j) are 0; only the last can be 1. This is the same
# as if the pieces came from k separate individuals, all but the last
# censored.
# This is the rationale behind time-splitting techniques, where one
# observation is broken into observations on multiple pseudo-individuals,
# as sketched below.
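# A sketch of the splitting (hypothetical subject followed from age 52 to
# death at age 67, with the hazard assumed constant within 5-year bands):
breaks <- c(50, 55, 60, 65, 70)
entry <- pmax(52, head(breaks, -1))
exit  <- pmin(67, tail(breaks, -1))
keep  <- entry < exit
# One row per pseudo-individual; all are censored except the last:
print(data.frame(entry = entry[keep], exit = exit[keep],
                 event = as.numeric(exit[keep] == 67)))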
# *Note that you cannot simply pretend that the data come from a Poisson
# distribution; the difference lies in the extent to which the random
# variation sits in the counts or in the amounts of time. When the data are
# heavily censored (the event is rare), the survival model is approximately
# Poisson.
# 15.2 Fitting Poisson models
# glm also handles the poisson family, which uses a log link by default.
# Models are formulated in terms of the log rate,
#   log(rho) = b0 + b1*x1 + ... + bk*xk
# but since glm needs a model for the expected counts lambda = rho*T rather
# than for the rates, we model
#   log(lambda) = b0 + b1*x1 + ... + bk*xk + log(T)
# with log(T) entering as an offset.
print(names(eba1977))
attach(eba1977)
# Lung cancer cases by age group in four Danish cities (the eba1977 data set).
# Fit a model with multiplicative effects of age and city on the lung cancer
# rate.
fit <- glm(cases~city+age+offset(log(pop)), family = poisson)
print(summary(fit))
# The coefficients are log rate ratios relative to the reference level (the
# city of Fredericia and the 50-54 age group); the intercept gives the log
# rate for 50-54-year-olds in Fredericia.
# A goodness-of-fit statistic is given by comparing the residual deviance with
# a chi-squared distribution on the stated degrees of freedom. The test is
# considered valid if the expected count in every cell exceeds 5.
print(min(fitted(fit)))
print(pchisq(deviance(fit), df.residual(fit), lower=F))
# If this p-value were below 0.05, the model would not fit the data well.
# There is clearly an age effect, but is there a city effect? Do a chi-squared
# test with drop1 on each term and see how the deviance changes.
print(drop1(fit, test='Chisq'))
# city is not significant overall, although its coefficients looked borderline
# in the summary. Maybe one particular city (Fredericia) has a higher rate.
fit2 <- glm(cases~(city=="Fredericia")+age, offset = log(pop), family = poisson)
print(anova(fit, fit2, test="Chisq"))
# The two models are not significantly different, so the simpler fit2
# suffices.
print(drop1(fit2, test="Chisq"))
# Only the Fredericia-vs-rest contrast is significant.
print(summary(fit2))
# In fit2 all terms are significant.
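# Rate ratios with 95% confidence intervals, computed two ways: Wald-type
# intervals from the coefficient table, and profile-likelihood intervals via
# confint (printed last) for comparison.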
cf <- coefficients(summary(fit2))
est <- cf[,1]
s.e. <- cf[,2]
rr <- exp(cbind(est, est-s.e.*qnorm(.975), est + s.e.*qnorm(.975)))
colnames(rr) <- c("RateRatio", "CI.lo", "CI.hi")
print(rr)
print(exp(cbind(coef(fit2), confint(fit2))))
detach()
# 15.3 Computing rates
# The nickel.expand data: follow-up of nickel refinery workers, already split
# by age group (agr) and calendar period (ygr), with England and Wales
# mortality rates merged in (e.g. lung, in deaths per million person-years).
print(head(nickel.expand))
print(subset(nickel.expand, id==325))
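# Add an indicator of death from lung cancer (ICD codes 162 and 163):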
nickel.expand <- within(nickel.expand, lung.cancer <- as.numeric(icd %in% c(162,163)))
attach(nickel.expand)
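# Tabulate person-years and lung cancer deaths by calendar period (ygr) and
# age group (agr):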
pyr <- tapply(ageout-agein, list(ygr,agr), sum)
print(round(pyr), na.print = "-")
count <- tapply(lung.cancer, list(ygr, agr), sum)
print(count, na.print="-")
print(round(count/pyr*1000, 1), na.print="-")
#compute expected counts in each cell based on the standard mortality table
#and then compare to the actual counts
expect.count <- tapply(lung/1e6*(ageout-agein), list(ygr, agr), sum)
print(round(expect.count, 1), na.print="-")
# The observed counts are larger than the expected counts. Summarize this with
# the standardized mortality ratio (SMR): the ratio of the total number of
# cases to the total expected number of cases.
expect.tot <- sum(lung/1e6*(ageout-agein))
print(expect.tot)
count.tot <- sum(lung.cancer)
print(count.tot)
print(count.tot/expect.tot)
# This cohort has about six times as many lung cancer deaths as the general
# population rates would predict.
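# Not in the original notes: poisson.test gives an exact test and confidence
# interval for the overall SMR, treating the expected count as the time base:
print(poisson.test(count.tot, expect.tot))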
# 15.4 Models with piecewise constant intensities
# The SMR can be formulated as a Poisson regression model. The SMR assumption
# is a constant rate ratio relative to the standard mortality, so we fit a
# model with only an intercept and an offset equal to the log of the expected
# count. This is similar to modeling rates: the population mortality pi is
# simply absorbed into the offset, log(pi) + log(Ti) = log(pi*Ti).
fit <- glm(lung.cancer ~ 1, poisson, offset = log((ageout-agein)*lung/1e6))
print(summary(fit))
# The model is based on individual data; the dependent variable is 0 or 1.
# We could instead model the cross-classification by agr and ygr, but then we
# could not check individual covariates such as age at first exposure.
# The deviance cannot be used for model checking here, in particular because
# the expected counts per cell are very small.
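# exp(intercept) is the overall SMR estimate: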
print(exp(coef(fit)))
# We can analyze the data more thoroughly using regression methods: check
# whether the SMR stays constant over calendar-year and age groups using a
# multiplicative Poisson model. This requires coarser groupings, because some
# of the groups contain very few cases.
print(tapply(lung.cancer, agr, sum))
print(tapply(lung.cancer, ygr, sum))
#need at least 10 cases per level
detach()
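# Collapse sparse levels: age groups below 50 into "<50", above 70 into
# "70+", and calendar periods from 1961 onward into "1961ff":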
nickel.expand <-within(nickel.expand,{
A <- factor(agr)
Y <- factor(ygr)
lv <- levels(A)
lv[1:6] <- "<50"
lv[11:13] <- "70+"
levels(A) <- lv
lv <- levels(Y)
lv[7:10] <- "1961ff"
levels(Y) <- lv
rm(lv)
})
attach(nickel.expand)
fit <- glm(lung.cancer ~ A + Y, poisson, offset = log((ageout-agein)*lung/1e6))
print(drop1(fit, test="Chisq"))
# The age grouping A is not significant and can be dropped; keep the calendar
# period Y. The "- 1" in the next model removes the intercept, so each
# coefficient is the log SMR for one period:
fit <- glm(lung.cancer ~ Y - 1, poisson, offset = log((ageout-agein)*lung/1e6))
print(summary(fit))
print(round(exp(coef(fit)),1))
expect.count <- tapply(lung/1e6*(ageout-agein), Y, sum)
count <- tapply(lung.cancer, Y, sum)
print(cbind(count=count, expect=round(expect.count,1), SMR=round(count/expect.count, 1)))
detach()
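# Derived covariates: time since first exposure (TFE), age at first exposure
# (AFE), calendar year of first exposure (YFE), and exposure index (EXP):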
nickel.expand <- within(nickel.expand, {
TFE <- cut(agein-age1st, c(0, 20, 30, 40, 50, 100), right=F)
AFE <- cut(age1st, c(0, 20, 27.5, 35, 100), right=F)
YFE <- cut(dob + age1st, c(0, 1910, 1915, 1920, 1925), right=F)
EXP <- cut(exposure, c(0, 0.5, 4.5, 8.5, 12.5, 25), right=F)
})
attach(nickel.expand)
fit <- glm(lung.cancer ~ TFE + AFE + YFE + EXP, poisson, offset = log((ageout-agein)*lung/1e6))
print(drop1(fit, test="Chisq"))
detach()
# Exercise 1: breast cancer mortality (bcmort data)
bcmort21 <- within(bcmort, {
period <- area <- cohort
levels(period) <- rep(c("1991-2001", "1981-1991"), each=2)
levels(area) <- rep(c("Cph+Frb", "Nat"), 2)
})
attach(bcmort21)
fit <- glm(bc.deaths ~ (age + period + area)^2, poisson, offset = log(p.yr), data=bcmort21)
print(summary(fit))
print(drop1(fit, test="Chisq"))
print(confint(fit, parm="period1981-1991:areaNat"))
detach()
# Exercise 2: split each stroke observation at the time points 0.5, 2, and
# 12, creating pseudo-individuals for a piecewise constant rate analysis
stroke.trim <- function(t1, t2)
  subset(transform(stroke,
                   entry = t1, exit = pmin(t2, obstime),
                   dead = dead & obstime <= t2),
         entry < exit)
stroke2 <- do.call(rbind, mapply(stroke.trim, c(0, 0.5, 2, 12),
                                 c(0.5, 2, 12, Inf), SIMPLIFY = FALSE))
print(table(stroke$dead))
print(table(stroke2$dead))
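# A sketch of where this heads (assumption: the aim is a piecewise constant
# mortality rate over the four intervals defined by the split points):
stroke2 <- transform(stroke2, interval = factor(entry))
fit <- glm(as.numeric(dead) ~ interval - 1, family = poisson,
           offset = log(exit - entry), data = stroke2)
print(round(exp(coef(fit)), 3))  # estimated rate per time unit per interval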