########################################################### # # # ISUP (J.-F. Dupuy) # # # ########################################################### library(MASS) library(AER) library(vcdExtra) data("NMES1988") # help(NMES1988) # head(NMES1988) # dim(NMES1988) ## select variables for analysis nmes = NMES1988[, c(1, 6:8, 13, 15, 18)] ## barplots showing the frequency distribution of the number of physician office visits par(mfrow = c(1,2)) visits.fac = factor(nmes$visits, levels=0:max(nmes$visits)) # include zero frequencies visits.tab = table(visits.fac) barplot(visits.tab, xlab="Number of physician office visits", ylab="Frequency",col="lightblue") abline(v=mean(nmes$visits), col="red", lwd=3) ci = mean(nmes$visits)+c(-1,1)*sd(nmes$visits) lines(x=ci, y=c(-4, -4), col="red", lwd=3, xpd=TRUE) plot(table(nmes$visits), xlab="Number of physician office visits", ylab="Frequency") ## some exploratory plots for bivariate visualization clog = function(x) log(x + 0.5) # a convenient transformation for exploratory graphics dev.new() par(mfrow = c(1,2)) plot(visits ~ chronic, data = nmes) plot(clog(visits) ~ cutfac(chronic), data = nmes) par(mfrow = c(3, 2)) plot(clog(visits) ~ health, data = nmes, varwidth = TRUE,xlab="Self-perceived health status", ylab="Physician office visits (in clogs)", main="health") plot(clog(visits) ~ cutfac(chronic), data = nmes,xlab="Number of chronic conditions", ylab="Physician office visits (in clogs)", main="chronic") plot(clog(visits) ~ insurance, data = nmes, varwidth = TRUE,xlab="Covered by private insurance", ylab="Physician office visits (in clogs)", main="insurance") plot(clog(visits) ~ cutfac(hospital, c(0:2, 8)), data = nmes,xlab="Number of hospital stays", ylab="Physician office visits (in clogs)", main="hospital") plot(clog(visits) ~ gender, data = nmes, varwidth = TRUE,xlab="Gender", ylab="Physician office visits (in clogs)", main="gender") plot(cutfac(visits, c(0:2, 4, 6, 10, 100)) ~ school, data = nmes, breaks = 9,xlab="Number of years of education", ylab="Physician office visits (number of)", main="school") par(mfrow = c(1, 1)) ############################################################ ######## use only if package vcdExtra not available ######## ############################################################ ## Some exploratory plots for bivariate visualization cfac = function(x, breaks = NULL) { if(is.null(breaks)) breaks = unique(quantile(x, 0:10/10)) x = cut(x, breaks, include.lowest = TRUE, right = FALSE) levels(x) = paste(breaks[-length(breaks)], ifelse(diff(breaks) > 1, c(paste("-", breaks[-c(1, length(breaks))] - 1, sep = ""), "+"), ""), sep = "") return(x) } # for transforming a count variable to a factor (for visualization purposes only) par(mfrow = c(3, 2)) plot(clog(visits) ~ health, data = nmes, varwidth = TRUE,xlab="Self-perceived health status", ylab="Physician office visits (in clogs)", main="health") plot(clog(visits) ~ cfac(chronic), data = nmes,xlab="Number of chronic conditions", ylab="Physician office visits (in clogs)", main="chronic") plot(clog(visits) ~ insurance, data = nmes, varwidth = TRUE,xlab="Covered by private insurance", ylab="Physician office visits (in clogs)", main="insurance") plot(clog(visits) ~ cfac(hospital, c(0:2, 8)), data = nmes,xlab="Number of hospital stays", ylab="Physician office visits (in clogs)", main="hospital") plot(clog(visits) ~ gender, data = nmes, varwidth = TRUE,xlab="Gender", ylab="Physician office visits (in clogs)", main="gender") plot(cfac(visits, c(0:2, 4, 6, 10, 100)) ~ school, data = nmes, breaks = 9,xlab="Number of years of education", ylab="Physician office visits (number of)", main="school") par(mfrow = c(1, 1)) ############################################################ #### end of use only if package vcdExtra not available ##### ############################################################ ######################### ## Poisson regression ######################### with(nmes, c(mean=mean(visits), var=var(visits), ratio=var(visits)/mean(visits))) nmes_pois = glm(visits ~ ., data = nmes, family = poisson) summary(nmes_pois) round(cbind(beta=coef(nmes_pois), expbeta=exp(coef(nmes_pois)), pct=100*(exp(coef(nmes_pois))-1)),3) pr=residuals(nmes_pois,"pearson") phi=sum(pr^2)/df.residual(nmes_pois) ######################### ## LM test for overdispersion ######################### # help(dispersiontest) dispersiontest(nmes_pois) dispersiontest(nmes_pois, trafo = 2) ######################### ## quasipoisson model ######################### nmes_qpois = glm(visits ~ ., data = nmes, family = quasipoisson) phi = summary(nmes_qpois)$dispersion phi ######################### ## NB regression ######################### nmes_nb = glm.nb(visits ~ ., data = nmes) round(cbind(p=coef(nmes_pois),qp=coef(nmes_qpois),NB=coef(nmes_nb),se.p=sqrt(diag(vcov(nmes_pois))),se.qp=sqrt(diag(vcov(nmes_qpois))),se.NB=sqrt(diag(vcov(nmes_nb)))),4) round(sqrt(diag(vcov(nmes_pois)))*sqrt(phi),4) xb=fitted(nmes_nb, type="response") # can be fitted with nmes_pois and nmes_qpois, which both provide same picture (close to the one obtained with nmes_nb) # xb = predict(nmes_nb) g = cut(xb, breaks=quantile(xb,seq(0,1,by=0.05))) m = tapply(nmes$visits, g, mean) v = tapply(nmes$visits, g, var) cbind(m,v) plot(m, v, xlab="Mean", ylab="Variance", main="Mean-Variance relationship",pch=16) mtext("NMES data",padj=-0.5) x = seq(min(m)-0.5,max(m)+0.5,0.01) lines(x, x*phi, lty="dashed",col="red",lwd=2) lines(x, x*(1+x/nmes_nb$theta),col="blue",lwd=2) legend("topleft", lty=c("dashed","solid"), col=c("red","blue"), lwd=c(2,2), legend=c("Q. Poisson","Neg. Binom."), inset=0.05) ######################### ## NB regression WITH FIXED theta ######################### # For a fixed value of theta, the NB is a special case of GLM and can be fitted with glm(). # If theta is unknown, the NB model is not a special case of the GLM and is fitted using glm.nb(). nmes_nb_fixedT=glm(visits ~ ., data = nmes, family=negative.binomial(1)) summary(nmes_nb_fixedT) # for very large theta, coefficient estimates are close to Poisson nmes_pois = glm(visits ~ ., data = nmes, family = poisson) nmes_nb_fixedT=glm(visits ~ ., data = nmes, family=negative.binomial(1000)) round(cbind(p=coef(nmes_pois),qp=coef(nmes_nb_fixedT)),4) ######################### ## ZIP regression ######################### library(pscl) nmes_zip0 = zeroinfl(visits ~ . |., data = nmes, dist = "poisson") summary(nmes_zip0) nmes_zip = zeroinfl(visits ~ . | hospital + chronic + insurance + school + gender, data = nmes, dist = "poisson") summary(nmes_zip) waldtest(nmes_zip0, nmes_zip) lrtest(nmes_zip0, nmes_zip) ## compare estimated coefficients for count models fm = list("ML-Pois" = nmes_pois, "Quasi-Pois" = nmes_qpois, "NB" = nmes_nb, "ZIP" = nmes_zip) round(sapply(fm, function(x) coef(x)[1:8]), digits = 3) ## associated standard errors round(cbind(sapply(fm, function(x) sqrt(diag(vcov(x)))[1:8])), digits = 3) round(cbind("ML-Pois" = sqrt(diag(vcov(nmes_pois))), sapply(fm[-1], function(x) sqrt(diag(vcov(x)))[1:8])), digits = 3) ## log-likelihoods and number of estimated parameters rbind(logLik = sapply(fm, function(x) round(logLik(x))), Df = sapply(fm, function(x) attr(logLik(x), "df"))) ## predicted number of zeros round(c("Obs" = sum(nmes$visits == 0), "ML-Pois" = sum(dpois(0, fitted(nmes_pois))), "Quasi-Pois" = NA, "NB" = sum(dnbinom(0, mu = fitted(nmes_nb), size = nmes_nb$theta)), "ZIP" = sum(predict(nmes_zip, type = "prob")[,1])))