###########################################################
#                                                         #
#                ISUP   (J.-F. Dupuy)                     #
#                                                         #
###########################################################

library(MASS)
library(AER)
library(vcdExtra)

data("NMES1988")

# help(NMES1988)
# head(NMES1988)
# dim(NMES1988)

## select variables for analysis

nmes = NMES1988[, c(1, 6:8, 13, 15, 18)]

## barplots showing the frequency distribution of the number of physician office visits

par(mfrow = c(1,2))

visits.fac = factor(nmes$visits, levels=0:max(nmes$visits)) # include zero frequencies
visits.tab = table(visits.fac)
barplot(visits.tab, xlab="Number of physician office visits", ylab="Frequency",col="lightblue")
abline(v=mean(nmes$visits), col="red", lwd=3)
ci = mean(nmes$visits)+c(-1,1)*sd(nmes$visits)
lines(x=ci, y=c(-4, -4), col="red", lwd=3, xpd=TRUE)

plot(table(nmes$visits), xlab="Number of physician office visits", ylab="Frequency") 

## some exploratory plots for bivariate visualization

clog = function(x) log(x + 0.5) # a convenient transformation for exploratory graphics

dev.new()

par(mfrow = c(1,2))
plot(visits ~ chronic, data = nmes)
plot(clog(visits) ~ cutfac(chronic), data = nmes)

par(mfrow = c(3, 2))
plot(clog(visits) ~ health, data = nmes, varwidth = TRUE,xlab="Self-perceived health status", ylab="Physician office visits (in clogs)", main="health")
plot(clog(visits) ~ cutfac(chronic), data = nmes,xlab="Number of chronic conditions", ylab="Physician office visits (in clogs)", main="chronic")
plot(clog(visits) ~ insurance, data = nmes, varwidth = TRUE,xlab="Covered by private insurance", ylab="Physician office visits (in clogs)", main="insurance")
plot(clog(visits) ~ cutfac(hospital, c(0:2, 8)), data = nmes,xlab="Number of hospital stays", ylab="Physician office visits (in clogs)", main="hospital")
plot(clog(visits) ~ gender, data = nmes, varwidth = TRUE,xlab="Gender", ylab="Physician office visits (in clogs)", main="gender")
plot(cutfac(visits, c(0:2, 4, 6, 10, 100)) ~ school, data = nmes, breaks = 9,xlab="Number of years of education", ylab="Physician office visits (number of)", main="school")
par(mfrow = c(1, 1))

############################################################
######## use only if package vcdExtra not available ########
############################################################

## Some exploratory plots for bivariate visualization

cfac = function(x, breaks = NULL) {
  if(is.null(breaks)) breaks = unique(quantile(x, 0:10/10))
  x = cut(x, breaks, include.lowest = TRUE, right = FALSE)
  levels(x) = paste(breaks[-length(breaks)], ifelse(diff(breaks) > 1,
    c(paste("-", breaks[-c(1, length(breaks))] - 1, sep = ""), "+"), ""), sep = "")
  return(x)
} # for transforming a count variable to a factor (for visualization purposes only)

par(mfrow = c(3, 2))
plot(clog(visits) ~ health, data = nmes, varwidth = TRUE,xlab="Self-perceived health status", ylab="Physician office visits (in clogs)", main="health")
plot(clog(visits) ~ cfac(chronic), data = nmes,xlab="Number of chronic conditions", ylab="Physician office visits (in clogs)", main="chronic")
plot(clog(visits) ~ insurance, data = nmes, varwidth = TRUE,xlab="Covered by private insurance", ylab="Physician office visits (in clogs)", main="insurance")
plot(clog(visits) ~ cfac(hospital, c(0:2, 8)), data = nmes,xlab="Number of hospital stays", ylab="Physician office visits (in clogs)", main="hospital")
plot(clog(visits) ~ gender, data = nmes, varwidth = TRUE,xlab="Gender", ylab="Physician office visits (in clogs)", main="gender")
plot(cfac(visits, c(0:2, 4, 6, 10, 100)) ~ school, data = nmes, breaks = 9,xlab="Number of years of education", ylab="Physician office visits (number of)", main="school")
par(mfrow = c(1, 1))

############################################################
#### end of use only if package vcdExtra not available #####
############################################################

#########################
## Poisson regression
#########################

with(nmes, c(mean=mean(visits), var=var(visits), ratio=var(visits)/mean(visits)))

nmes_pois = glm(visits ~ ., data = nmes, family = poisson)
summary(nmes_pois)

round(cbind(beta=coef(nmes_pois), expbeta=exp(coef(nmes_pois)), pct=100*(exp(coef(nmes_pois))-1)),3)

pr=residuals(nmes_pois,"pearson")
phi=sum(pr^2)/df.residual(nmes_pois)

#########################
## LM test for overdispersion
#########################

# help(dispersiontest)

dispersiontest(nmes_pois)
dispersiontest(nmes_pois, trafo = 2)

#########################
## quasipoisson model
#########################

nmes_qpois = glm(visits ~ ., data = nmes, family = quasipoisson)
phi = summary(nmes_qpois)$dispersion
phi

#########################
## NB regression
#########################

nmes_nb = glm.nb(visits ~ ., data = nmes)

round(cbind(p=coef(nmes_pois),qp=coef(nmes_qpois),NB=coef(nmes_nb),se.p=sqrt(diag(vcov(nmes_pois))),se.qp=sqrt(diag(vcov(nmes_qpois))),se.NB=sqrt(diag(vcov(nmes_nb)))),4)
round(sqrt(diag(vcov(nmes_pois)))*sqrt(phi),4)

xb=fitted(nmes_nb, type="response")      # can be fitted with nmes_pois and nmes_qpois, which both provide same picture (close to the one obtained with nmes_nb)
# xb = predict(nmes_nb) 
g = cut(xb, breaks=quantile(xb,seq(0,1,by=0.05)))
m = tapply(nmes$visits, g, mean)
v = tapply(nmes$visits, g, var)

cbind(m,v)

plot(m, v, xlab="Mean", ylab="Variance", main="Mean-Variance relationship",pch=16)
mtext("NMES data",padj=-0.5)
x = seq(min(m)-0.5,max(m)+0.5,0.01)
lines(x, x*phi, lty="dashed",col="red",lwd=2)
lines(x, x*(1+x/nmes_nb$theta),col="blue",lwd=2)
legend("topleft", lty=c("dashed","solid"), col=c("red","blue"), lwd=c(2,2), legend=c("Q. Poisson","Neg. Binom."), inset=0.05)

#########################
## NB regression WITH FIXED theta
#########################

# For a fixed value of theta, the NB is a special case of GLM and can be fitted with glm().
# If theta is unknown, the NB model is not a special case of the GLM and is fitted using glm.nb().

nmes_nb_fixedT=glm(visits ~ ., data = nmes, family=negative.binomial(1))
summary(nmes_nb_fixedT)

# for very large theta, coefficient estimates are close to Poisson

nmes_pois = glm(visits ~ ., data = nmes, family = poisson)
nmes_nb_fixedT=glm(visits ~ ., data = nmes, family=negative.binomial(1000))

round(cbind(p=coef(nmes_pois),qp=coef(nmes_nb_fixedT)),4)

#########################
## ZIP regression
#########################

library(pscl)

nmes_zip0 = zeroinfl(visits ~ . |.,   data = nmes, dist = "poisson")
summary(nmes_zip0)

nmes_zip = zeroinfl(visits ~ . | hospital + chronic + insurance + school + gender,   data = nmes, dist = "poisson")
summary(nmes_zip)

waldtest(nmes_zip0, nmes_zip)
lrtest(nmes_zip0, nmes_zip)

## compare estimated coefficients for count models

fm = list("ML-Pois" = nmes_pois, "Quasi-Pois" = nmes_qpois, "NB" = nmes_nb, "ZIP" = nmes_zip)
round(sapply(fm, function(x) coef(x)[1:8]), digits = 3)

## associated standard errors

round(cbind(sapply(fm, function(x) sqrt(diag(vcov(x)))[1:8])), digits = 3)
round(cbind("ML-Pois" = sqrt(diag(vcov(nmes_pois))), sapply(fm[-1], function(x) sqrt(diag(vcov(x)))[1:8])), digits = 3)

## log-likelihoods and number of estimated parameters

rbind(logLik = sapply(fm, function(x) round(logLik(x))),  Df = sapply(fm, function(x) attr(logLik(x), "df")))

## predicted number of zeros

round(c("Obs" = sum(nmes$visits == 0),
  "ML-Pois" = sum(dpois(0, fitted(nmes_pois))),
  "Quasi-Pois" = NA,
  "NB" = sum(dnbinom(0, mu = fitted(nmes_nb), size = nmes_nb$theta)),
  "ZIP" = sum(predict(nmes_zip, type = "prob")[,1])))