MASS Introductory Session

This is an R Markdown document. Markdown is a simple formatting syntax for authoring web pages (click the MD toolbar button in R Studio for help on Markdown).

When you click the Knit HTML button a web page will be generated that includes both content as well as the output of any embedded R code chunks within the document.

# A command to make MASS datasets available.
library(MASS)

# generate 1000 pairs of normal variates
x <- rnorm(1000)
y <- rnorm(1000)

# Histogram of a mixture of normal distributions.
truehist(c(x, y + 3), nbins = 25)

plot of chunk unnamed-chunk-1


# 2D density plot
contour(dd <- kde2d(x, y))

plot of chunk unnamed-chunk-1


# Pseudo-color plot
image(dd)

plot of chunk unnamed-chunk-1


# Make x = (1,1.5,2,...,19.5,20) and list it
x <- seq(1, 20, 0.5)
x
##  [1]  1.0  1.5  2.0  2.5  3.0  3.5  4.0  4.5  5.0  5.5  6.0  6.5  7.0  7.5
## [15]  8.0  8.5  9.0  9.5 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5
## [29] 15.0 15.5 16.0 16.5 17.0 17.5 18.0 18.5 19.0 19.5 20.0

# w will be used as a 'weight' vector and to give the standard deviations
# of the errors.
w <- 1 + x/2
y <- x + w * rnorm(x)

# Make a data frame of the three columns named x, y, and w and look at it.
# Remove the original x, y and w.
dum <- data.frame(x, y, w)
dum
##       x       y     w
## 1   1.0  1.0084  1.50
## 2   1.5  4.3779  1.75
## 3   2.0  0.9178  2.00
## 4   2.5  2.4718  2.25
## 5   3.0  7.5329  2.50
## 6   3.5  2.9086  2.75
## 7   4.0  4.0597  3.00
## 8   4.5  7.1072  3.25
## 9   5.0  2.7417  3.50
## 10  5.5  5.9513  3.75
## 11  6.0 -0.1003  4.00
## 12  6.5 10.1198  4.25
## 13  7.0  0.8725  4.50
## 14  7.5  6.7014  4.75
## 15  8.0  3.7657  5.00
## 16  8.5  8.2129  5.25
## 17  9.0 13.6275  5.50
## 18  9.5  1.7811  5.75
## 19 10.0  9.9657  6.00
## 20 10.5 11.2661  6.25
## 21 11.0  5.4617  6.50
## 22 11.5 16.8678  6.75
## 23 12.0 18.0417  7.00
## 24 12.5 13.0930  7.25
## 25 13.0  7.3773  7.50
## 26 13.5 11.8460  7.75
## 27 14.0 11.1900  8.00
## 28 14.5 13.5381  8.25
## 29 15.0  9.2161  8.50
## 30 15.5 20.5345  8.75
## 31 16.0 12.0131  9.00
## 32 16.5 26.9279  9.25
## 33 17.0 25.9467  9.50
## 34 17.5 24.9509  9.75
## 35 18.0 26.0009 10.00
## 36 18.5 10.8730 10.25
## 37 19.0  3.0037 10.50
## 38 19.5 22.8037 10.75
## 39 20.0  1.6549 11.00
rm(x, y, w)

# fit a simple linear regression of y on a and look at the analysis
fm <- lm(y ~ x, data = dum)
summary(fm)
## 
## Call:
## lm(formula = y ~ x, data = dum)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.568  -3.336  -0.169   3.942  11.766 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.730      2.072    0.35     0.73    
## x              0.875      0.174    5.03  1.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 6.11 on 37 degrees of freedom
## Multiple R-squared: 0.406,   Adjusted R-squared: 0.39 
## F-statistic: 25.3 on 1 and 37 DF,  p-value: 1.29e-05

# Since we know the standard deviations, we can do a weighted regression.
fm1 <- lm(y ~ x, data = dum, weight = 1/w^2)
summary(fm1)
## 
## Call:
## lm(formula = y ~ x, data = dum, weights = 1/w^2)
## 
## Weighted Residuals:
##    Min     1Q Median     3Q    Max 
## -1.528 -0.587 -0.070  0.802  1.630 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.907      0.793    1.14     0.26    
## x              0.850      0.127    6.71    7e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.861 on 37 degrees of freedom
## Multiple R-squared: 0.549,   Adjusted R-squared: 0.537 
## F-statistic:   45 on 1 and 37 DF,  p-value: 6.97e-08

# Fit a smooth regression curve using a modern regression function.
lrf <- loess(y ~ x, dum)

# Make the columns in the data frame visible as variables.
attach(dum)

# Make a standard scatterplot. To this plot we will add the three
# regression lines (or curves) as well as the known true line.
plot(x, y)

# First add in the local regression curve using a spline interpolation
# between the calculated points.
lines(spline(x, fitted(lrf)), col = 2)

# Add in the true regression line (intercept 0, slope 1) with a different
# line type and color.
abline(0, 1, lty = 3, col = 3)

# Add in the unweighted regression line. abline() is able to extract the
# information it needs from the fitted regression object.
abline(fm, col = 4)

# Finally add in the weighted regression line, in line type 4. This one
# should be the most accurate estimate, but may not be, of course.
abline(fm1, lty = 4, col = 5)

plot of chunk unnamed-chunk-2


# A standard regression diagnostic plot to check for heteroscedasticity,
# that is, for unequal variances. The data are generated from a
# heteroscedastic process, so can you see this from this plot?
plot(fitted(fm), resid(fm), xlab = "Fitted Values", ylab = "Residuals")

plot of chunk unnamed-chunk-2


# A normal scores plot to check for skewness, kurtosis and outliers. (Note
# that the heteroscedasticity may show as apparentnon-normality.)
qqnorm(resid(fm))
qqline(resid(fm))

plot of chunk unnamed-chunk-2