--- title: "Visualizing Three or More Numeric Variables, Continued" output: html_document: toc: yes --- ```{r global_options, include=FALSE} knitr::opts_chunk$set(collapse = TRUE) knitr::knit_hooks$set(webgl = rgl::hook_webgl) options(rgl.useNULL = TRUE) ``` ```{r, include = FALSE} library(lattice) library(tidyverse) source("threedata.R") library(rgl) set.seed(12345) ``` ## Other 3D Viewing Options Other options for viewing a 3D scene include: * [Anaglyph 3D](https://en.wikipedia.org/wiki/Anaglyph_3D) using red/cyan glasses. * [Polarized 3D](https://en.wikipedia.org/wiki/Polarized_3D_system) as currently used in many 3D movies. * [Stereograms, stereoscopy](https://en.wikipedia.org/wiki/Stereoscopy) A stereogram * presents each eye with an image from a slightly different viewing angle; * the brain fuses the images into a 3D view in a process called [_stereopsis_](https://en.wikipedia.org/wiki/Stereopsis). This approach is or was used * with virtual reality displays like [Oculus Rift](https://www.meta.com/quest/products/quest-2/); * in the [View-Master toy](https://en.wikipedia.org/wiki/View-Master); * in aerial photography using viewers like this: ![](https://upload.wikimedia.org/wikipedia/commons/3/31/Pocket_stereoscope.jpg) You can create your own viewer with paper or cardboard tubes, for example. 
A stereo image for the soil data surface: ```{r, message = FALSE} tp <- trellis.par.get() trellis.par.set(theme = col.whitebg()) ocol <- trellis.par.get("axis.line")$col oclip <- trellis.par.get("clip")$panel trellis.par.set(list(axis.line = list(col = "transparent"), clip = list(panel = FALSE))) print(wireframe(soi.fit ~ soi.grid$easting * soi.grid$northing, cuts = 9, aspect = diff(range(soi.grid$n)) / diff(range(soi.grid$e)), xlab = "Easting (km)", ylab = "Northing (km)", shade = TRUE, screen = list(z = 40, x = -70, y = 3)), split = c(1, 1, 2, 1), more = TRUE) print(wireframe(soi.fit ~ soi.grid$easting * soi.grid$northing, cuts = 9, aspect = diff(range(soi.grid$n)) / diff(range(soi.grid$e)), xlab = "Easting (km)", ylab = "Northing (km)", shade = TRUE, screen = list(z = 40, x = -70, y = 0)), split = c(2, 1, 2, 1)) trellis.par.set(list(axis.line = list(col = ocol), clip = list(panel = oclip))) trellis.par.set(tp) ``` ## Some Calibration Experiments ### Simple Linear Structure Linear structure with normally distributed data: ```{r} n <- 1000 d1 <- data.frame(x = rnorm(n), y = rnorm(n)) d1 <- mutate(d1, z = x + y) ``` ```{r, eval = TRUE} splom(d1) xyplot(z ~ x | equal.count(y), data = d1) xyplot(z ~ x | equal.count(y, 12), data = d1) xyplot(z ~ x | equal.count(y, overlap = -0.8), data = d1) splom(d1, group = abs(x) < 0.1) ``` ```{r, webgl = TRUE} clear3d() points3d(d1) ``` Linear structure with uniformly distributed data: ```{r} d2 <- data.frame(x = runif(n, -1, 1), y = runif(n, -1, 1)) d2 <- mutate(d2, z = x + y) ``` ```{r, eval = TRUE} splom(d2) xyplot(z ~ x | equal.count(y, overlap = -0.8), data = d2) splom(d2, group = abs(x) < 0.1) ``` ```{r, webgl = TRUE} clear3d() points3d(d2) ``` ### Nonlinearity and Noise Normal data with non-linearity and noise: ```{r} d3 <- mutate(d1, z = cos(x) + cos(y) + 0.5 * (x + y) + rnorm(x, 0, .2)) ``` ```{r, eval = TRUE} splom(d3) xyplot(z ~ x | equal.count(y, overlap = -0.8), data = d3) splom(d3, group = abs(x) < 0.1) splom(d3, 
group = abs(x + 1) < 0.1) ``` Uniform data with non-linearity and noise: ```{r} d4 <- mutate(d2, z = cos(x) + cos(y) + 0.5 * (x + y) + rnorm(x, 0, .1)) ``` ```{r, eval = TRUE} xyplot(z ~ x | equal.count(y, 12, overlap = -0.8), data = d4) splom(d4, group = abs(x) < 0.1) ``` Full data in the background for context: ```{r, eval = TRUE} xyplot(z ~ x | equal.count(y, 12, overlap = -0.8), data = d4, panel = function(x, y, ...) { panel.xyplot(d4$x, d4$z, col = "grey") panel.xyplot(x, y, ...) }) ``` ```{r, webgl = TRUE} clear3d() points3d(d4) ``` ### More Variables Two-dimensional views, conditioning on two variables: ```{r} n <- 10000 d <- data.frame(x1 = rnorm(n), x2 = rnorm(n), x3 = rnorm(n)) d <- mutate(d, x4 = as.numeric(scale(x1 + cos(x2) + sin(x3) + rnorm(n, 0, 0.1)))) ``` ```{r, eval = TRUE} splom(d) splom(~ d, data = d, group = abs(x1) < 0.1) splom(~ d, data = d, group = abs(x1) < 0.1 & abs(x2) < 0.1) ``` ```{r, eval = TRUE, fig.height = 8} xyplot(x4 ~ x3 | equal.count(x1, 4, overlap = -0.8) + equal.count(x2, 4, overlap = -0.8), data = d, asp = 1) ``` Three-dimensional views, conditioning on one variable: ```{r} dd <- filter(d, abs(x1) < 0.1)[2 : 4] ``` ```{r, webgl = TRUE} clear3d() points3d(dd) ``` ```{r, webgl = TRUE} clear3d() spheres3d(dd, radius = 0.05) ``` ```{r, webgl = TRUE} clear3d() spheres3d(dd, radius = 0.05, col = ifelse(abs(dd$x2) < 0.1, "red", "blue")) ``` ```{r, webgl = TRUE} dd1 <- filter(d, abs(x1 - 1) < 0.1)[2 : 4] dd2 <- filter(d, abs(x1 + 1) < 0.1)[2 : 4] clear3d() spheres3d(dd1, radius = 0.05, color = rep("red", ncol(dd1))) spheres3d(dd2, radius = 0.05, color = rep("blue", ncol(dd2))) ``` ```{r, eval = TRUE} ddx <- mutate(d, v = ifelse(abs(x1 - 1) < 0.1, 1, ifelse(abs(x1 + 1) < 0.1, 2, NA))) ddx <- filter(ddx, ! 
is.na(v)) cloud(x4 ~ x2 * x3, group = v, data = ddx, auto.key = TRUE) cloud(x4 ~ x2 * x3 | v, data = ddx) ``` ## Interactive Explorations ### Tk Sliders Using `tkrplot` A simple animation function using the `tkrplot` package: ```{r} library(tkrplot) tkranim <- function(fun, v, lo, hi, res = 0.1, title, label) { tt <- tktoplevel() if (! missing(title)) tkwm.title(tt, title) draw <- function() fun(v) img <- tkrplot(tt, draw) tv <- tclVar(init = v) f <- function(...) { vv <- as.numeric(tclvalue(tv)) if (vv != v) { v <<- vv tkrreplot(img) } } s <- tkscale(tt, command = f, from = lo, to = hi, variable = tv, showvalue = FALSE, resolution = res, orient = "horiz") if (! missing(label)) tkpack(img, tklabel(tt, text = label), s) else tkpack(img, s) } ``` Interactive conditioning for one of the calibration examples: ```{r, eval = FALSE} fd4 <- function(v) print(splom(~ d4, group = abs(x - v) < 0.1, data = d4)) tkranim(fd4, 0, -1, 1, 0.05) ``` Interactive slicing for the soil resistivity surface: ```{r, eval = FALSE} f <- function(i) { e <- eastseq[i] idx <- which(sf$easting == e) r <- range(sf$fit) print(xyplot(fit ~ northing, data = sf[idx, ], ylim = r, type = "l", main = sprintf("Easting = %f", e))) } tkranim(f, 1, 1, 85, 1) ``` Interactive conditioning for the soil resistivity data: ```{r, eval = FALSE} binwidth <- 0.1 ## bad idea to use a global variable g <- function(i) { e <- eastseq[i] idx <- which(abs(soil$easting - e) <= binwidth) ry <- range(soil$resistivity) rx <- range(soil$northing) print(xyplot(resistivity ~ northing, data = soil[idx, ], xlim = rx, ylim = ry, main = sprintf("Easting = %f", e))) } tkranim(g, 1, 1, 85, 1) ``` ### Shiny Sliders A version of the animation function using the [`shiny` package](https://shiny.rstudio.com/): ```{r} shinyAnim <- function(fun, v, lo, hi, step = NULL, label = "Frame") { require(shiny) shinyApp( ui = fluidPage( sidebarLayout( sidebarPanel(sliderInput("n", label, lo, hi, v, step, animate = animationOptions(100))), 
mainPanel(plotOutput("plot")) ) ), server = function(input, output, session) { session$onSessionEnded(function() shiny::stopApp()) output$plot <- renderPlot(fun(input$n)) } ) } ``` ```{r, eval = FALSE} shinyAnim(fd4, 0, -1, 1, 0.05) ``` ```{r, eval = FALSE} shinyAnim(f, 1, 1, 85) ``` ```{r, eval = FALSE} shinyAnim(g, 1, 1, 85, 1) ``` ### Animations as Movies To show how a plot changes with a single parameter it is possible to pre-compute the images and assemble them into a movie. This is easy to do in Rmarkdown, but requires the `ffmpeg` software to be installed. Currently, `ffmpeg` is not available on the CLAS Linux systems. A [simple example](https://stat.uiowa.edu/~luke/examples/anim.html) ([Rmd](https://stat.uiowa.edu/~luke/examples/anim.Rmd)). ### Linking and Brushing Linked plots and _brushing_ were pioneered at Bell Laboratories in the early 1980s. * Some videos are available in the ASA Statistics Computing and Statistical Graphics Sections' [video library](https://community.amstat.org/jointscsg-section/media/videos/). * Trellis displays were developed by the same research group [_after_ the development of brushing](http://civilstat.com/2016/10/when-static-graphs-beat-interactives/). The [`iplots`](http://www.iplots.org/) package provides a set of basic plots that are all linked by default. ```{r, eval = FALSE} library(iplots) with(soil, iplot(northing, resistivity)) ``` ```{r, eval = FALSE} with(soil, iplot(easting, northing)) ``` The `rggobi` package provides another framework for linked plots and brushing within R. ```{r, eval = FALSE} library(rggobi) ggobi(soil) ``` Several other tools with linking/brushing support and good integration with R are under development. * One of the furthest along seems to be [`rbokeh`](https://hafen.github.io/rbokeh/). * `plotly` also provides some support for linked brushing. 
## Example: Ethanol Data The `ethanol` data frame in the `lattice` package contains data from an experiment on efficiency and emissions in small one-cylinder engines. The data frame contains 88 observations on three variables: * `NOx`: Concentration of nitrogen oxides (NO and NO2) in micrograms. * `C`: Compression ratio of the engine. * `E`: Equivalence ratio, a measure of the richness of the air and ethanol fuel mixture. A scatterplot matrix: ```{r} library(lattice) splom(ethanol) ``` Conditional plots of `NOx` against `E` for each level of `C`: ```{r, eval = TRUE} xyplot(NOx ~ E | C, data = ethanol) ggplot(ethanol, aes(x = E, y = NOx)) + geom_point() + facet_wrap(~ C) ``` A single plot with color to separate levels of `C`: ```{r, eval = TRUE} xyplot(NOx ~ E, group = C, data = ethanol, auto.key = TRUE) ggplot(ethanol, aes(x = E, y = NOx, color = factor(C))) + geom_point() ``` Augmented with smooths: ```{r, eval = TRUE} xyplot(NOx ~ E, group = C, type = c("p", "smooth"), data = ethanol, auto.key = TRUE) ggplot(ethanol, aes(x = E, y = NOx, color = factor(C))) + geom_point() + geom_smooth(se = FALSE) ``` Conditional plots of `NOx` against `C` given levels of `E`: ```{r, eval = TRUE} with(ethanol, { Equivalence.Ratio <- equal.count(E, number = 9, overlap = 0.25) xyplot(NOx ~ C | Equivalence.Ratio, panel = function(x, y) { panel.xyplot(x, y) panel.loess(x, y, span = 1) }, aspect = 2.5, layout = c(5, 2), xlab = "Compression Ratio", ylab = "NOx (micrograms/J)") }) ``` An interactive 3D view: ```{r, webgl = TRUE} library(rgl) clear3d() with(ethanol, points3d(E, C / 5, NOx / 5, size = 4, col = rep("black", nrow(ethanol)))) ``` ## Example: Earth Quake Locations and Magnitudes The `quakes` data frame contains data on locations of seismic events of magnitude 4.0 or larger in a region near Fiji. The time frame is from 1964 to perhaps 2000. More recent data is available from a number of sources on the web. 
Some scatter plot matrices: ```{r, fig.height = 7, fig.align = "center"} library(lattice) splom(quakes) splom(quakes, cex = 0.1) splom(quakes, alpha = 0.1) splom(quakes, panel = panel.smoothScatter) ``` Where the quakes are located: ```{r, eval = TRUE, message = FALSE} library(maps) map("world2", c("Fiji", "Tonga", "New Zealand")) with(quakes, points(long, lat, col = "red", cex = 0.1)) map("world2", c("Fiji", "Tonga", "New Zealand")) with(quakes, points(long, lat, col = heat.colors(nrow(quakes))[rank(depth)], cex = 0.1)) map("world2", c("Fiji", "Tonga", "New Zealand")) with(quakes, points(long, lat, col = heat.colors(nrow(quakes))[rank(depth)])) ``` Showing `depth` with grouping: ```{r, eval = TRUE} quakes2 <- mutate(quakes, depth_cut = cut_number(depth, 3)) xyplot(lat ~ long, aspect = "iso", groups = depth_cut, data = quakes2, auto.key = list(columns = 3, title = "Depth")) ggplot(quakes2, aes(x = long, y = lat, color = depth_cut)) + geom_point() + theme_bw() + coord_map() ggplot(quakes2, aes(x = long, y = lat, colour = depth_cut, size = mag)) + geom_point() + theme_bw() + coord_map() ``` Showing `depth` with a trellis display: ```{r, eval = TRUE} xyplot(lat ~ long | depth_cut, aspect = "iso", data = quakes2) xyplot(lat ~ long | depth_cut, aspect = "iso", type = c("p", "g"), data = quakes2, auto.key = TRUE) ggplot(quakes2, aes(x = long, y = lat)) + geom_point(color = "blue") + facet_wrap(~ depth_cut, nrow = 1) + theme_bw() + coord_map() ``` Adding the full data for background context: ```{r, eval = TRUE} xyplot(lat ~ long | cut(depth, 3), aspect = "iso", type = c("p", "g"), panel = function(x, y, ...) { panel.xyplot(quakes$long, quakes$lat, col = "grey") panel.xyplot(x, y, ...) 
}, data = quakes) ## quakes does not contain the depth_cut variable used in the facet ggplot(quakes2, aes(x = long, y = lat)) + geom_point(data = quakes, color = "gray") + geom_point(color = "blue") + facet_wrap(~ depth_cut, nrow = 1) + theme_bw() + coord_map() ``` 3D views of `lat`, `long`, and `depth`: ```{r, eval = TRUE} cloud(-depth ~ long * lat, data = quakes) ``` ```{r, webgl = TRUE} library(rgl) bg3d(col = "black") clear3d() with(quakes, points3d(long, lat, -depth / 50, size = 2, col = rep("white", nrow(quakes)))) ``` ```{r, webgl = TRUE} clear3d() with(quakes, points3d(long, lat, -depth / 50, size = 2, col = heat.colors(nrow(quakes))[rank(mag)])) ``` ## Visualizing 2D and 3D Density Estimates ### 2D Density Estimates We have already seen some examples of plotting 2D density contours on a scatter plot using `geom_density2d`. Here is a version that explicitly creates the density estimate: ```{r} data(geyser, package = "MASS") geyser <- mutate(geyser, duration = lag(duration)) geyser <- filter(geyser, ! 
is.na(duration)) plot(waiting ~ duration, data = geyser) sc <- function(x) as.numeric(scale(x)) de <- with(geyser, MASS::kde2d(sc(duration), sc(waiting), n = 50, h = 0.5)) contour(de) with(geyser, points(sc(duration), sc(waiting))) ``` The density estimate can also be viewed with a perspective plot or an interactive 3D plot: ```{r} persp(de) ``` ```{r, webgl = TRUE} clear3d() bg3d(col = "white") surface3d(unique(de$x), unique(de$y), de$z, col = "red") box3d() axes3d() ``` ### 3D Density Estimates A 3D normal mixture: ```{r} library(rgl) library(misc3d) x <- matrix(rnorm(9000), ncol = 3) x[1001 : 2000, 2] <- x[1001 : 2000, 2] + 4 x[2001 : 3000, 3] <- x[2001 : 3000, 3] + 4 ``` ```{r, webgl = TRUE} clear3d() bg3d(col = "white") points3d(x = x[, 1], y = x[, 2], z = x[, 3], size = 2, color = "black") ``` ```{r} g <- expand.grid(x = seq(-4, 8.5, len = 30), y = seq(-4, 8.5, len = 30), z = seq(-4, 8.5, len = 30)) ``` Density estimates for two bandwidths: ```{r} d <- kde3d(x[, 1], x[, 2], x[, 3], 0.2, 30, c(-4, 8.5))$d d2 <- kde3d(x[, 1], x[, 2], x[, 3], 0.5, 30, c(-4, 8.5))$d ``` 3D plot of data and contour for bw = 0.2: ```{r, webgl = TRUE} clear3d() xv <- seq(-4, 8.5, len = 30) points3d(x = x[, 1], y = x[, 2], z = x[, 3], size = 2, color = "black") contour3d(d, 20 / 3000, xv, xv, xv, color = "red", alpha = 0.5, add = TRUE) ``` 3D plot of data and a contour for bw = 0.5: ```{r, webgl = TRUE} clear3d() points3d(x = x[, 1], y = x[, 2], z = x[, 3], size = 2, color = "black") contour3d(d2, 20 / 3000, xv, xv, xv, color = "red", alpha = 0.5, add = TRUE) ``` 3D plot of several contours for bw = 0.5: ```{r, webgl = TRUE} clear3d() contour3d(d2, c(10, 25, 40) / 3000, xv, xv, xv, color = c("red", "green", "blue"), alpha = c(0.2, 0.5, 0.7), add = TRUE) ``` Density contours for the `quakes` data: ```{r, webgl = TRUE} library(misc3d) library(rgl) clear3d() bg3d(col = "white") points3d(quakes$long / 22, quakes$lat / 28, -quakes$depth / 640, size = 2, col = rep("black", nrow(quakes))) 
box3d(col = "gray") ``` ```{r, webgl = TRUE} d <- kde3d(quakes$long, quakes$lat, -quakes$depth, n = 40) contour3d(d$d, level = exp(-12), x = d$x / 22, y = d$y / 28, z = d$z / 640, color = "green", color2 = "gray", scale = FALSE, add = TRUE) ``` Re-scaling depth to (approximately) match surface location scale: ```{r, webgl = TRUE} clear3d() points3d(quakes$long, quakes$lat, -quakes$depth / 110, size = 2, col = rep("black", nrow(quakes))) box3d(col = "gray") ``` ```{r, webgl = TRUE} d <- kde3d(quakes$long, quakes$lat, -quakes$depth, n = 40) contour3d(d$d, level = exp(-12), x = d$x, y = d$y, z = d$z / 110, color = "green", color2 = "gray", scale = FALSE, add = TRUE) ``` Variant with alpha blending and two contour surfaces: ```{r, webgl = TRUE} clear3d() points3d(quakes$long, quakes$lat, -quakes$depth / 110, size = 2) box3d(col = "gray") d <- kde3d(quakes$long, quakes$lat, -quakes$depth, n = 40) contour3d(d$d, level = exp(c(-10, -12)), x = d$x, y = d$y, z = d$z / 110, color = c("red", "green"), alpha = 0.1, scale = FALSE, add = TRUE) ``` ## Example: Environmental Data Daily measurements of ozone concentration, wind speed, temperature and solar radiation in New York City from May to September of 1973. This analysis is from Cleveland's _Visualizing Data_ book. A goal is to see to what degree ozone variation can be explained by radiation, temperature, and wind variation. ```{r} library(lattice) splom(environmental, pscale = 0) histogram(environmental$ozone) histogram(environmental$ozone ^ (1 / 3)) ## (Cleveland uses cube root) env <- mutate(environmental, ozone = ozone ^ (1 / 3)) splom(env) ``` Ozone seems to increase with temperature and decrease with wind. 
Some conditional plots: ```{r, eval = TRUE, fig.height = 8} xyplot(ozone ~ radiation | equal.count(temperature, 4) * equal.count(wind, 4), data = env, type = c("p", "g", "smooth"), asp = 1.5, col.line = "red") ``` ```{r, eval = TRUE} xyplot(ozone ~ temperature | equal.count(radiation, 3, overlap = 0) * equal.count(wind, 2, overlap = 0.2), data = env, type = c("p", "g", "smooth")) ``` ```{r, eval = TRUE} xyplot(ozone ~ temperature | equal.count(radiation, 4, overlap = 0), group = ifelse(wind >= 10, "high", "low"), data = env, auto.key = TRUE, type = c("p", "g", "smooth")) ``` 3D scatter plot with wind split in two levels, `red` for low wind, `blue` for high wind: ```{r} ne <- names(env) wcut <- with(env, cut_number(wind, 2)) idx <- as.integer(wcut) cols <- c("red", "blue")[idx] ``` ```{r, webgl = TRUE} library(rgl) clear3d() points3d(scale(env[-4]), radius = 0.05, col = cols) axes3d() title3d(xlab = ne[1], ylab = ne[2], zlab = ne[3]) ``` Fitting a `loess` surface: ```{r} fit.env <- loess(ozone ~ wind * temperature * radiation, span = 1.0, degree = 2, data = env) scaleBy <- function(x, y) (x - mean(y)) / sd(y) ``` Visualising the surface: ```{r} tmesh <- with(env, do.breaks(range(temperature), 50)) rmesh <- with(env, do.breaks(range(radiation), 50)) wmesh <- with(env, do.breaks(range(wind), 50)) grid <- expand.grid(temperature = tmesh, radiation = rmesh, wind = wmesh) grid$fit <- as.numeric(predict(fit.env, newdata = grid)) gridw <- expand.grid(temperature = tmesh, radiation = rmesh, wind = 8) fit.pred1 <- predict(fit.env, newdata = mutate(gridw, wind = 8)) fit.pred2 <- predict(fit.env, newdata = mutate(gridw, wind = 11)) gridw$fit1 <- as.numeric(fit.pred1) gridw$fit2 <- as.numeric(fit.pred2) ``` ```{r, webgl = TRUE} library(rgl) clear3d() with(env, surface3d(scaleBy(unique(gridw$temperature), temperature), scaleBy(unique(gridw$radiation), radiation), scaleBy(gridw$fit1, ozone), col = "red")) with(env, surface3d(scaleBy(unique(gridw$temperature), temperature), 
scaleBy(unique(gridw$radiation), radiation), scaleBy(gridw$fit2, ozone), col = "blue")) axes3d() box3d() title3d(xlab = "temp", ylab = "rad", zlab = "ozone") ``` Conditional plots of the fit surface: ```{r, eval = TRUE, fig.height = 8} g <- with(env, expand.grid(radiation = do.breaks(range(radiation), 50), temperature = do.breaks(range(temperature), 3), wind = do.breaks(range(wind), 3))) g$fit <- as.numeric(predict(fit.env, newdata = g)) xyplot(fit ~ radiation | temperature * wind, data = g, type = c("g", "l"), asp = 1.5) ``` Cleveland points out that there is no data for high temperature/high wind or low temperature/low wind, so estimates in those regions are unreliable: ```{r, eval = TRUE} xyplot(wind ~ temperature, data = env) ``` Setting the corresponding fit values to `NA` emphasizes the more reliable results: ```{r, eval = TRUE, fig.height = 8} ## Cleveland Fig 5.9 gg <- g gg$fit[gg$temperature > 80 & gg$wind > 20] <- NA gg$fit[gg$temperature < 80 & gg$wind < 5] <- NA xyplot(fit ~ radiation | temperature * wind, data = gg, type = c("g", "l"), asp = 1.5) ``` Residuals plots: ```{r, eval = TRUE} env$fit <- predict(fit.env) env$res <- env$ozone - env$fit xyplot(res ~ fit, data = env) ``` Spread-location plots: ```{r, eval = TRUE} xyplot(sqrt(abs(res)) ~ fit, data = env) ``` Residual and fit spread (rfs) plots: ```{r, eval = TRUE} rfs(fit.env) ``` ## Medical Imaging Data Medical imaging technologies, such as [magnetic resonance imaging (MRI)](https://en.wikipedia.org/wiki/Magnetic_resonance_imaging) produce a three-dimensional array of _intensity values_, one for each _volume element_, or _voxel_. A common visualization allows the viewer to interactively examine image plots of the intensities for different slices through the volume. 
```{r, eval = FALSE} readVol <- function(fname, dim = c(181, 217, 181)) { f <- gzfile(fname, open = "rb") on.exit(close(f)) b <- readBin(f, "integer", prod(dim), size = 1, signed = FALSE) array(b, dim) } imgfile <- "/group/statsoft/data/Brainweb/images/T1/t1_icbm_normal_1mm_pn3_rf20.rawb.gz" v <- readVol(imgfile) library(misc3d) slices3d(v) ``` The interactive heatmap can also be used for functions of three variables, such as a density estimate: ```{r, eval = FALSE} slices3d(d2) ``` ## Notes ### What to Look for in a Scatter Plot The easiest features to see in a scatterplot: - outliers, clusters (zero-dimensional objects) - curves (one-dimensional objects) Changes in point density are also perceivable, but much less easily. ### Conditioning and Projection Scatter plots show a two-dimensional projection of the data. - Projections preserve dimension of zero and one dimensional objects. - Two and higher dimensional objects are collapsed to two dimensions. Zero and one dimensional objects can be occluded in a projection by higher-dimensional objects. Conditioning can reduce dimensionality: - conditioning on one variable reduces dimensionality by one; - conditioning on two variables reduces dimensionality by two. Conditioning on more than two variables is rarely practical. Many different projections are possible. Some useful tools for exploring: - 3D visualizations; - scatter plot matrices; - principal components analysis; - grand and guided tours; - scagnostics. ### 3D Visualizations - 3D visualizations take advantage of our visual system's ability to create a 3D mental image from a 2D rendering. - Several techniques are used to help our visual system: - motion; - depth cueing; - perspective projection. - For viewing data it is often helpful to turn off the perspective adjustment. - Most systems for 3D viewing provide a way to do this. 
An illustration is provided by the `randu` data set of 400 triples of successive random numbers from the [RANDU](https://en.wikipedia.org/wiki/RANDU) pseudo-random number generator. The flaw is much easier to see when the perspective adjustment is removed. ```{r, webgl = TRUE} library(rgl) clear3d() points3d(randu, size = 5) ``` ```{r, webgl = TRUE} par3d(FOV = 1) ## removes perspective distortion ``` ## Two Classification Problems Two data sets where the objective is to identify class membership from measurements - Australian Crabs - Olive Oils ### Australian Crabs The data frame `crabs` in the `MASS` package contains measurements on crabs of a species that has been split into two based on color, orange or blue. Preserved specimens lose their color (and possibly ability to identify sex). Data were collected to help classify preserved specimens. It would be useful to have a fairly simple classification rule. An initial view: ```{r, fig.height = 7, fig.align = "center"} data(crabs, package = "MASS") splom(~ crabs[4 : 8], group = paste(sex, sp), data = crabs, auto.key = TRUE, pscale = 0) ``` - The variables are highly correlated, reflecting overall size and age. - The `RW * CL` or `RW * CW` plots separate males and females well, at least for larger crabs. Some possible next steps: - Consider a `log` transformation if the marginal distributions are skewed. - Reduce the correlation by a linear or nonlinear transformation. A quick look at histograms: ```{r} library(tidyr) library(ggplot2) ggallhists <- function(data, nb = nclass.Sturges(data[[1]])) { ggplot(gather(data, var, val), aes(val)) + geom_histogram(bins = nb) + xlab("") + ylab("") + facet_wrap(~ var, scales = "free") } ggallhists(crabs[4 : 8]) ``` - It does not look like a `log` transformation is needed. Principal component analysis identifies a rotation of the data that produces uncorrelated _scores_. 
```{r, fig.height = 7, fig.align = "center"} pc <- princomp(crabs[4 : 8]) splom(~ pc$scores, group = paste(sex, sp), data = crabs, auto.key = TRUE, pscale = 0) ``` - There is some separation of the classes in the plot of the second and third components. - A nonlinear transformation that scales relative to one of the variables, say `CL`, may do better. ```{r, fig.height = 7, fig.align = "center"} cr <- mutate(crabs, FLCL = FL / CL, RWCL = RW / CL, CWCL = CW / CL, BDCL = BD / CL) splom(~ cr[9 : 12], group = paste(sex, sp), data = cr, auto.key = TRUE, pscale = 0) ``` - The `CWCL * BDCL` plot shows good species separation by a line. - Using `CWCL - BDCL` will pick out the axis of separation. ```{r, fig.height = 7, fig.align = "center"} splom(~ data.frame(FLCL, RWCL, CWCL - BDCL), group = paste(sex, sp), data = cr, auto.key = TRUE, pscale = 0) ``` - The `FLCL * (CWCL - BDCL)` plot shows perfect separation. - After another rotation to `FLCL - (CWCL - BDCL) = FLCL + BDCL - CWCL` a simple criterion emerges. ```{r} stripplot(paste(sex, sp) ~ FLCL + BDCL - CWCL, data = cr, jitter = TRUE, group = paste(sex, sp), panel = function(...) { panel.stripplot(...) panel.abline(v = -0.23, lty = 2) }) ``` - The sexes are well separated by `RWCL` for larger crabs; less so for smaller ones. ```{r} xyplot(RWCL ~ CL | sp, data = cr, group = paste(sex), auto.key = TRUE) ``` - Dividing the crabs at `CL` of 30 into small and large gives a simple rule that works fairly well across species and sizes. ```{r} xyplot(RWCL ~ CL | sp, data = cr, group = paste(sex), auto.key = TRUE, panel = function(...) { panel.xyplot(...) ## panel.segments(c(0, 30), c(0.42, 0.39), c(30, 60), c(0.42, 0.39), ## lty = 2) ds <- 0.42 dl <- 0.39 panel.lines(c(0, 30, 30, 60), c(ds, ds, dl, dl), lty = 2, col = "black") }) ``` - Using principal component analysis on the scaled data produces a similar result based on the first two principal components. 
```{r, fig.height = 7, fig.align = "center"} pc <- princomp(cr[9 : 12]) splom(~ pc$scores, group = paste(sex, sp), data = cr, auto.key = TRUE, pscale = 0) ``` - When designing a classifier it is usually a good idea to split the data into a _training set_ and a _test set_. - After building the classifier with the training data its performance can be assessed using the test data. ```{r, include = FALSE} ## This reproduces what biplot() does and could be used to make a ## ggplot version. The ggbiplot function in ggfortify does something ## different with the loadings. bp <- function(pc) { choices <- 1 : 2 n <- pc$n.obs scores <- pc$scores[, choices] loadings <- pc$loadings[, choices] lam <- pc$sdev[choices] lam <- lam * sqrt(n) scores <- t(t(scores) / lam) vars <- t(t(loadings) * lam) plot(scores) vars <- vars * max(abs(scores)) / max(abs(vars)) * 0.8 arrows(0, 0, vars[, 1], vars[, 2], col = "red") text(vars, rownames(loadings), col = "red") invisible() } ``` ### Olive Oils The `olives` data frame in the `extracat` package contains measurements on the proportion of 8 fatty acids (% times 100) in olive oil samples. - Samples come from 9 areas grouped in three regions. ![](img/olive-oils-map.png) - A goal is to see if region and perhaps area can be determined from these measurements. - This might help in discovering cheap varieties masquerading as expensive ones. - A useful approach is to first try to identify the regions and then the areas within regions. A first look at the data: ```{r, fig.height = 8, fig.align = "center"} library(lattice) olives <- read.csv("https://stat.uiowa.edu/~luke/data/olives.csv") splom(~ olives[3 : 10], cex = 0.1, group = Region, data = olives, auto.key = TRUE, pscale = 0) ``` - Oils from the `South` region have a much higher `eicosenoic` content than oils from the other regions. 
```{r} stripplot(Region ~ eicosenoic, data = olives, type = c("p", "g"), jitter = TRUE, group = Region) ``` - To separate `North` from `Sardinia` we can remove the `South` samples. ```{r, fig.height = 8, fig.align = "center"} ons <- filter(olives, Region != "South") ons <- droplevels(ons) splom(~ ons[3 : 10], cex = 0.1, group = Region, data = ons, auto.key = TRUE, pscale = 0) ``` - The `arachidic * linoleic` plot allows these two regions to be separated: ```{r} xyplot(arachidic ~ linoleic, group = Region, data = ons, type = c("p", "g"), auto.key = TRUE) ``` - Among the non-`South` oils the ones from `Sardinia` can be identified as the ones with `linoleic > 1000` and `arachidic > 40`. Looking at the oils for the two areas within Sardinia: ```{r, fig.height = 8, fig.align = "center"} sar <- filter(olives, Region == "Sardinia") sar <- droplevels(sar) splom(~ sar[3 : 10], cex = 0.1, group = Area, data = sar, auto.key = TRUE, pscale = 0) ``` - Within `Sardinia` the `oleic * linoleic` plot suggests looking at `oleic - linoleic` ```{r} stripplot(Area ~ oleic - linoleic, data = sar, jitter = TRUE, type = c("p", "g")) ``` The inland oils are identified by having `oleic - linoleic > 6000`. ## Grand and Guided Tours An alternative approach is to use `rggobi` to explore the data with _grand tours_ and _guided tours_. - The _grand tour_ is a framework for animating smoothly through a sequence of randomly chosen projections. - A _guided tour_ encourages the tour to move towards projections that are more interesting according to one of several criteria. - The `rggobi` approach is documented in > Cook, Dianne and Swayne, Deborah F. (2007), _Interactive and Dynamic > Graphics for Data Analysis with R and GGobi_, Springer Verlag. To explore this approach with the scaled `crabs` data: ```{r, eval = FALSE} library(rggobi) ggobi(cr) ``` ## Scagnostics - With a large number of variables it isn't realistic to view all possible pairwise scatter plots. 
- This becomes even more of an issue with automated projection systems like the grand tour. - _Scagnostics_, or scatter plot diagnostics, are numerical summaries of interesting features in a scatter plot. - The projection pursuit indices used in the guided tour are a form of scagnostic. - One possible use is to automatically compute scagnostics for all possible pairwise scatter plots and view the ones with high scores. - A [German Wikipedia article](https://de.wikipedia.org/wiki/Scagnostics) has some graphics that illustrate the proposed diagnostics. ![](https://upload.wikimedia.org/wikipedia/commons/thumb/c/c7/ScagnosticsConvex.svg/1024px-ScagnosticsConvex.svg.png) ![](https://upload.wikimedia.org/wikipedia/commons/thumb/0/02/ScagnosticsClumpy.svg/1024px-ScagnosticsClumpy.svg.png) - Some references: > Wilkinson, Leland, Anushka Anand, and Robert > L. Grossman. "Graph-theoretic scagnostics." INFOVIS. Vol. 5. 2005. > Dang, Tuan Nhon, and Leland Wilkinson. "Scagexplorer: Exploring > scatterplots by their scagnostics." Visualization Symposium > (PacificVis), 2014 IEEE Pacific. IEEE, 2014. - An R package implementing these ideas is available. 
```{r, fig.height = 8, fig.align = "center"} library(scagnostics) s <- scagnostics(olives[3 : 10]) heatmap(s) colnames(s)[max.col(s)] splom(t(unclass(s)), varname.cex = 0.7, pscale = 0) ``` ## Two Sliders Using Shiny This function creates a `shiny` animation using two sliders: ```{r, eval = FALSE} ## two-parameter shiny ## performance is not good shinyAnim2 <- function(fun, v, lo, hi, step = NULL, label = "Frame", fps = 4) { require(shiny) msecs <- 1000 / fps shinyApp( ui = fluidPage( sidebarLayout( sidebarPanel( sliderInput("v1", label[1], lo[1], hi[1], v[1], step[1], animate = animationOptions(msecs)), sliderInput("v2", label[2], lo[2], hi[2], v[2], step[2], animate = animationOptions(msecs))), mainPanel(plotOutput("plot")) ) ), server = function(input, output, session) { session$onSessionEnded(function() shiny::stopApp()) output$plot <- renderPlot(fun(input$v1, input$v2)) }) } ``` Some calibration data: ```{r, eval = FALSE} library(tidyverse) library(shiny) library(lattice) library(ggplot2) n <- 10000 d <- data.frame(x1 = runif(n, -1, 1), x2 = runif(n, -1, 1), x3 = rnorm(n)) d <- mutate(d, x4 = as.numeric(scale(x1 + cos(x2) + sin(x3) + rnorm(n, 0, 0.1)))) ``` Animation using `splom`: ```{r, eval = FALSE} fd <- function(v1, v2) splom(~ d, data = d, group = abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1) shinyAnim2(fd, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05)) ``` Using `xyplot`: ```{r, eval = FALSE} fd2 <- function(v1, v2) xyplot(x4 ~ x3, group = abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1, data = d, pch = 19) shinyAnim2(fd2, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05)) ``` Using `ggplot`: ```{r, eval = FALSE} fd3 <- function(v1, v2) ggplot(d, aes(x = x3, y = x4)) + geom_point(color = "grey") + geom_point(data = filter(d, abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1), color = "red") shinyAnim2(fd3, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05)) ``` Rendering of `ggplot` and `lattice` graphics is slow and not ideal for animations; using base graphics helps a little: ```{r, eval = 
FALSE} fd4 <- function(v1, v2) { plot(x4 ~ x3, data = d, pch = 19, col = "blue", asp = 1) with(filter(d, abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1), points(x3, x4, col = "red", pch = 19)) } shinyAnim2(fd4, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05)) ```