## ## Some examples and illustrations of regression & linear models ## Stefan Evert, 5 Feb 2009 ## ## open graphics device with suitably scaled fonts and tight margins for screen presentation dev.new() par(cex=1.3, mar=c(4.1, 4.1, 0.5, 0.5)) ## ## we need to be able to generate synthetic random data for two variables Y, X ## with a specified functional relationship (Y = f(X)) plus Gaussian random errors ## ## f ... relationship between X and Y: Y = f(X) + E [default: Y = X + E] ## n ... number of random samples ## sd ... standard deviation of Gaussian random error E ## x.min, x.max ... range of values for X (random uniform distribution) ## sort=TRUE ... order data points by X (increasing) random.data <- function (f=function (x) x, n=50, sd=1, x.min=-2, x.max=2, sort=FALSE) { x <- runif(n, x.min, x.max) if (sort) x <- sort(X) y <- f(x) + rnorm(n, 0, sd) data.frame(x=x, y=y) } ## ## simple linear regression with lm() ## S <- random.data() # generate random data set (50 points) plot(S$x, S$y, type="p", pch=20, xlab="X", ylab="Y") # always a good idea to plot data first ## linear regression uses lm() function, e.g. Y against X with y ~ x R <- lm(y ~ x, data=S) print(R) # prints regression coefficients abline(R$coefficients, lwd=2) # plot regression line ## correlation coefficient has to be calculated with summary() method print(summary(R)$r.squared) # correlation coefficient = % of variance "explained" ## ## illustrate properties of linear regression with some examples on random data ## ## function to plot data set D with regression line ## - regression for Y ~ X; if both=TRUE, also show inverse regression for X ~ Y do.plot <- function (D, xlim=c(-2,2), ylim=c(-2,2), both=FALSE) { plot(D$x, D$y, type="p", pch=20, xlab="X", ylab="Y", xlim=xlim, ylim=ylim) ## perform standard regression, plot regression line, obtain correlation regression <- lm(y ~ x, data=D) abline(regression$coefficients, col="blue", lwd=2) r2 <- summary(regression)$r.squared ## display correlation coefficient as legend box in bottom right corner ## NB: bquote allows you to interpolate variables and expressions into a mathematical formula ## (this is fairly advanced R stuff, look at ?plotmath for examples) legend("bottomright", inset=.02, bg="white", legend = bquote(R^2 == .(round(r2, 4))) ) if (both) { ## perform inverse regression (X ~ Y) and plot as above regression <- lm(x ~ y, data=D) ab <- regression$coefficients ## regression line is X = a + bY --> Y = -(a/b) + (1/b)X abline(-ab[1]/ab[2], 1/ab[2], col="red", lwd=2) legend("topright", inset=.02, bg="white", lwd=3, col=c("blue", "red"), legend=c("Y ~ X", "X ~ Y")) } } ## regression for linear relationship with different slopes (but same random error) ## (note how we pass an anonymous function to random.data() to specify relationship Y = f(X) + E) do.plot(random.data(function (x) x)) # same as above: y = f(x) = x do.plot(random.data(function (x) .5*x)) # y = f(x) = .5 * x do.plot(random.data(function (x) .2*x)) # y = f(x) = .2 * x do.plot(random.data(function (x) 2*x)) # y = f(x) = 2 * x ## --> correlation depends on ratio between slope (of linear relation) and amount of error (variance) ## here is the same effect for different amounts of random error (s.d. of Gaussian error) do.plot(random.data(function (x) .3*x, sd=.2)) do.plot(random.data(function (x) .3*x, sd=.5)) do.plot(random.data(function (x) .3*x, sd=1)) do.plot(random.data(function (x) .3*x, sd=2)) ## rerun each command several times to see how reliable regression line is! ## strong correlation --> regression lines for Y ~ X and X ~ Y are similar do.plot(random.data(function (x) .3*x, sd=.2), both=TRUE) do.plot(random.data(function (x) .3*x, sd=.5), both=TRUE) do.plot(random.data(function (x) .3*x, sd=1), both=TRUE) ## ## the behaviour of linear regression for non-linear relationships is unpredictable ## ## monotonic non-linear function --> high correlation, regression line captures trend ## (hyperbolic asymptote, arbitrarily chosen for illustration purposes) do.plot(random.data(function (x) 2 - 4/(x+3), sd=.5), both=FALSE) do.plot(random.data(function (x) 2 - 4/(x+3), sd=.15), both=FALSE) do.plot(random.data(function (x) 2 - 4/(x+3), sd=.15), both=TRUE) ## symmetric non-linear function (parabola) --> no linear correlation detected do.plot(random.data(function (x) x^2 - 2, sd=.4), both=FALSE) do.plot(random.data(function (x) x^2 - 2, sd=.4), both=TRUE) ## ## polynomial regression as a special case of the general linear model ## ## obtain random points S with parabolic relationship between X and Y S <- random.data(function (x) x^2 - 2, sd=.4) plot(S$x, S$y, type="p", pch=20, xlab="X", ylab="Y") # redo our standard regression plot abline(lm(y ~ x, S)$coefficients, lwd=2, col="blue") ## for polynomial regression, we need to include a term x^2 = x*x in the model; ## since "*" has a special meaning in model formulae, we have to "protect" it with I() R <- lm(y ~ x + I(x*x), data=S) # quadratic regression = 2-nd degree polynomial print(R) print(summary(R)$r.squared) ## --> polynomial regression achieves very good fit with R^2 > 90% ## we cannot use abline to plot the non-linear regression line (because it is not straight); ## the easiest and most general solution is to obtain model predictions for different values of X L <- data.frame(x=seq(-2, 2, .1)) # L is a data frame with x- and y-coordinates of points on the line L$y <- predict(R, newdata=L) # y-coordinates are model predictions for the specified x-coordinates lines(L$x, L$y, lwd=2, col="red") # draw polynomial regression line in red legend("top", inset=.05, bg="white", lwd=2, col=c("blue","red"), legend=c("linear","polynomial")) ## Exercise: ## Modify the code above to perform cubic polynomial regression (with 3-rd degree polynomial). ## Which additional terms do you need to include in the model formula? Use summary(R) and ## anova(R) to find out which of the regression coefficients are significant. What do you expect? ## Does the regression line look plausible? ## [see below for a short explanation of summary(), anova() and predict() functions] ## ## multiple regression and linear models (U.S. cars in 1973-74) ## data(mtcars) # a popular data set for simple demos ?mtcars # always find out what exactly the data set contains! ## before you perform a linear regression analysis, it's always a good idea to plot ## the data and look for obvious correlations and nonlinearities pairs(~ disp + hp, mtcars) # displacement and power should be correlated ## can we predict fuel consumption (mpg) from displacement, power, weight, ... ? pairs(~ mpg + disp + hp + wt, mtcars) ## we can define our own panel function e.g. to add smoothed trend lines to panels below diagonal my.panel <- function (x, y, ...) { points(x, y, pch=20, ...) # this is what the default panel function does (except for pch=) lines(lowess(x,y), lwd=2, col="blue") # trend line from LOWESS smoother } pairs(~ mpg + disp + hp + wt, mtcars, lower.panel=my.panel) ## we should be able to predict acceleration from power and weight pairs(~ qsec + hp + wt, mtcars, lower.panel=my.panel) ## regression of fuel consumption (mpg = miles per gallon) on power (hp) and weight (wt) ## - 1 stands for the implicit intercept term and is usually omitted from the specification ## - note how 1 + hp + wt corresponds to the columns of the design matrix ## - this model assumes independent, additive linear effects from hp and wt! R <- lm(mpg ~ 1 + hp + wt, data=mtcars) print(R) # information about linear regression ## --> increasing hp and/or wt reduces mpg, but the coefficient for wt is much larger ## --> NB: coefficients (= slope) depend on measurement units and DO NOT show correlation strength! ## the summary() of a linear regression provides information about the statistical linear model summary(R) ## --> shows that mpg is "explained" quite well by hp and wt (R^2 = 82.68%) ## --> standard errors for regression coefficients (as estimators = random variables) ## --> t-tests show whether each regression coefficient has a significant effect (i.e. is non-zero) ## analysis of variance (ANOVA) describes significance of predictor variables in an intuitive way; ## it shows what proportion of the total variance (sum of squares) of mpg each predictor "explains" anova(R) sum((mtcars$mpg - mean(mtcars$mpg))^2) # total sum of squares of the mpg variable for comparison ## --> hp "explains" 678 out of 1126, wt another 252; residual error (ESS) is 195.05 ## --> F-tests compare nested models: mpg ~ 1 vs. mpg ~ 1 + hp vs. mpg ~ 1 + hp + wt ## --> all predictors have a highly significant effect ## NB: ANOVA results depend on ordering of predictors anova(lm(mpg ~ 1 + wt + hp, data=mtcars)) ## --> now wt explains the largest part of the variance, though complete model is the same as above ## --> weight is a much better predictor of fuel consumption by itself than power ## check for interaction between weight and power (wt:hp stands for product weight * power) R <- lm(mpg ~ 1 + hp + wt + hp:wt, data=mtcars) R <- lm(mpg ~ 1 + hp * wt, data=mtcars) # hp * wt is short-hand for terms plus interaction anova(R) ## --> highly significant interaction explains another 65 of sum of squared summary(R) ## --> small positive coefficient of hp:wt interaction: what does this imply? ## confidence intervals for (individual) regression coefficients confint(R) ## we can now predict fuel consumption of new models of cars ## - for illustration, predict Mercedes models from model trained on remaining cars ## - to do so, we split mtcars into test (Mercedes) and training (all other cars) data mtcars.test <- mtcars[8:14, ] mtcars.train <- mtcars[-(8:14), ] # everything except for rows 8 .. 14 R <- lm(mpg ~ hp * wt, data=mtcars.train) summary(R) anova(R) ## prediction uses predict() method of regression model; values of predictor variables ## for new data points are passed in newdata= argument, as data frame with suitable names ## (mtcars.test has just the right form; columns hp and wt would be sufficient) predicted.mpg <- predict(R, newdata=mtcars.test, interval="prediction") ## NB: interval="prediction" calculates confidence intervals for the predicted values ## print predictions for Mercedes models together with the correct values print(cbind(predicted.mpg, mtcars.test)) ## --> not quite spot on, but true values are contained in the (large) confidence intervals ## Exercise: ## Try to model other linear regressions (e.g. examples from pairs plots above) in a similar way. ## ## illustrate regression on two predictor variables with 3D plots ## [fuel consumption (mpg) against power (hp) and weight (wt) of cars in mtcars data set] ## library(rgl) # ------------------------------ if you can't install RGL, skip this part open3d() # RGL doesn't draw in the standard plot window ## plot3d() works almost like the standard plot() command, just in 3 dimensions ## - use type="s" (small shaded spheres) if there aren't very many points in the scatterplot ## - grab box with mouth to rotate, ctrl+click to zoom (or mouse wheel up/down) plot3d(mtcars$hp, mtcars$wt, mtcars$mpg, # plot mpg against hp and wt type="s", size=.3, col="red", # points are little (size=.2) red spheres xlim=c(0,400), ylim=c(0,8), zlim=c(0,40), # specify axis ranges, labels, etc. xlab="power", ylab="weight", zlab="miles/gallon") ## to plot regression plane as 3D surface, build regular grid G over x and y coordinates grid.X <- seq(0, 400, length=21) # X coordinates of grid points (horsepower) grid.Y <- seq(0, 8, length=21) # Y coordinates of grid points (weight) ## --> two-dimensional grid with 20 x 20 facets ## - one facet would be enough for regression plane, but we'll plot curved surfaces later ## - if your computer / graphics card isn't fast enough, reduce number of facets ## regression on power of engine (one predictor variable) R <- lm(mpg ~ hp, data=mtcars) summary(R) # --> R^2 = 60% ## z-coordinates for the surface3d() function are a bit tricky: we need a matrix that ## specifies z = f(x,y) for every combination of x and y values (from grid.X and grid.Y); ## the most convenient method uses outer(), which generates such a matrix from the ## x- and y-coordinates of grid points and the "height function" f (such that z=f(x,y)) ## f() must accept vectors x and y, and return the corresponding vector of height values z f <- function (x,y) predict(R, newdata=data.frame(hp=x, wt=y)) ## Z is the required matrix of height values () Z <- outer(grid.X, grid.Y, f) ## print Z to get a better intuitive grasp of the layout rownames(Z) <- grid.X colnames(Z) <- grid.Y print(round(Z, 2)) ## now plot the regression plane with surface3d() par3d(ignoreExtent=TRUE) # it's important to disable automatic resizing of the 3D scene! surface3d(grid.X, grid.Y, Z, col="blue", alpha=.7) # alpha < 1 makes surface semi-transparent par3d(ignoreExtent=FALSE) # but turn it back on afterwards, otherwise next plot3d() will fail ## compare this with regression on weight of the car (as single predictor) pop3d() # remove previous regression plane from scene R <- lm(mpg ~ wt, data=mtcars) summary(R) # --> R^2 = 75% Z <- outer(grid.X, grid.Y, f) # you know the drill ... par3d(ignoreExtent=TRUE) surface3d(grid.X, grid.Y, Z, col="blue", alpha=.7) par3d(ignoreExtent=FALSE) ## if we want to try more regression models, it's a good idea to wrap these lines in a function lm.3d <- function (formula, data=mtcars) { R <- lm(formula, data=data) print(summary(R)) ## NB: global function f() can't see local variable R, so we need our own defintion of f! f <- function (x,y) predict(R, newdata=data.frame(hp=x, wt=y)) Z <- outer(grid.X, grid.Y, f) par3d(ignoreExtent=TRUE) surface3d(grid.X, grid.Y, Z, col="blue", alpha=.7) par3d(ignoreExtent=FALSE) invisible(R) # return regression object, but make it invisible so it doesn't print automatically } ## regression on both variables gives a much better fit pop3d() lm.3d(mpg ~ hp + wt) # --> R^2 = 82.7% ## there is a significant interaction between power and weight -> curved surface pop3d() lm.3d(mpg ~ hp * wt) # --> R^2 = 88.5% ## --> very good fit, but counterintuitive predictions outside data range! (why?) ## two-dimensional polynomial regression (need to list monomials explicitly) pop3d() R <- lm.3d(mpg ~ hp + wt + I(hp^2) + I(wt^2) + hp:wt, data=mtcars) # --> R^2 = 89.2% ## --> many factors are no longer significant ("causal competition") ## --> polynomial regression improves fit only slightly, but more plausible extrapolation ## with a little fiddling, we can also plot a confidence region for the regression surface ## (NB: we are using interval="confidence" here to obtain a confidence region for the "true" ## regression surface; interval="prediction" as used above returns a larger confidence interval ## into which unseen data points are expected to fall with 95% confidence) f.lower <- function (x,y) predict(R, newdata=data.frame(hp=x, wt=y), interval="confidence")[,2] f.upper <- function (x,y) predict(R, newdata=data.frame(hp=x, wt=y), interval="confidence")[,3] Z.lower <- outer(grid.X, grid.Y, f.lower) Z.upper <- outer(grid.X, grid.Y, f.upper) par3d(ignoreExtent=TRUE) surface3d(grid.X, grid.Y, Z.lower, col="yellow", alpha=.3) # very light yellow surface3d(grid.X, grid.Y, Z.upper, col="yellow", alpha=.3) par3d(ignoreExtent=FALSE) ## --> good confidence along main diagonal (because power and weight are collinear) ## --> unreliable in orthogonal direction where there are hardly any data points ## Exercise: ## There is still one very implausible aspect of the regression surface. What is it? ## Is polynomial regression an appropriate model for these data? Is there a better transformation? rgl.close() # ------------------------------ end of RGL part