#### lm - linear modelling function in R #### # assumptions: # * errors are iid normally distributed # with constant variance and independent of explanatories # * response is related to explanatory variables linearly # Use data from the book on Venice Sea Level Data library(SMPracticals) data(venice) venice # we only want y1 from 1931 - 1981 sea <- venice[venice$year %in% 1931:1981, c("year", "y1")] names(sea) <- c("year", "sealevel") plot(sea$year, sea$sealevel, xlab = "Year", ylab = "Sea level (cm)", pch = 19) #### syntax #### # intercept at year = 0 lm(sealevel ~ year, data = sea) sea$cent.year <- sea$year - mean(sea$year) # intercept at year = x_bar - as in book lm(sealevel ~ cent.year, data = sea) # arguments: # 1. model formula # 2. data = dataframe containing variables # others - you shouldn't need any yet ## model formula ## # y ~ x translates to the model y_i = \beta_0 + x_i \beta_1 + epsilon_i # with epsilon_i iid N(0, sigma^2) # y ~ x - 1 translates to the model y_i = x_i \beta_1 + epsilon_i # i.e. no intercept # y ~ x1 + x2 translates to the model y_i = \beta_0 + \beta_1 x_1i + \beta_2 x_2i + epsilon_i # read more under Details in: ?formula #### working with lm output #### # lm returns a special type of lm object # save it into a variable fit <- lm(sealevel ~ cent.year, data = sea) summary(fit) ### careful!!!! - in lm sigma^2 is estimated with 1/(n-p) here. Why? # useful functions that work on an lm object # the estimated coefficients coef(fit) beta_0_hat <- coef(fit)[1] beta_1_hat <- coef(fit)[2] # The estimated coefficients are beta_0 = 119.6 and beta_1 = 0.57 # the residuals residuals(fit) # the fitted values fitted(fit) # predicting a new value using the model predict(fit, newdata = data.frame(cent.year = 2000 - 1956)) # getting the MLE of sigma^2 n <- nrow(sea) 1/n * sum(residuals(fit)^2) # the unbiased estimate of sigma^2 p <- 2 S_sq <- 1/(n-p) * sum(residuals(fit)^2) S_sq # the estimate for simga^2 is 346.7 # constructing confidence intervals beta_0_se <- sqrt(S_sq) / sqrt(n) beta_1_se <- sqrt(S_sq) / sqrt(sum((sea$year - mean(sea$year))^2)) beta_0_se beta_1_se # compare with sqrt(vcov(fit)) beta_0_CI <- beta_0_hat + c(-1, 1) * qt(0.975, df = n-p) * beta_0_se beta_1_CI <- beta_1_hat + c(-1, 1) * qt(0.975, df = n-p) * beta_1_se beta_0_CI beta_1_CI #compare with confint(fit) # the 95%CI for beta_0 is (114.4, 124.8) # the 95%CI for beta_1 is (0.21, 0.92) ## useful diagnostic plots ## # qqplot of residuals - normality assumption qqnorm(residuals(fit)) qqline(residuals(fit)) # fitted versus residuals - constant variance, iid plot(fitted(fit), residuals(fit)) abline(0, 0, col = "grey") # residuals versus an explanatory variable plot(sea$year, residuals(fit)) # data with fitted line plot(sea$year - mean(sea$year), sea$sealevel, xlab = "Year", ylab = "Sea level (cm)", pch = 19) abline(fit) plot(sea$year, sea$sealevel, xlab = "Year", ylab = "Sea level (cm)", pch = 19) abline(beta_0_hat - beta_1_hat * mean(sea$year), beta_1_hat) #### Current lab #### # lab #6 asks you to use cars dataset - already in R cars # Stopping distances for cars - recorded in 1920's plot(cars$speed, cars$dist, xlab = "Speed (mph)", ylab = "Stopping Distance (ft)") # Remember to write out your results! # I.e. The 95% confidence interval for beta_0 is (blah, blah) ... etc.