---
title: "Visualizing Three or More Numeric Variables, Continued"
output:
html_document:
toc: yes
---
```{r global_options, include=FALSE}
knitr::opts_chunk$set(collapse=TRUE)
knitr::knit_hooks$set(webgl = rgl::hook_webgl)
options(rgl.useNULL=TRUE)
```
```{r, include = FALSE}
library(lattice)
library(tidyverse)
source("threedata.R")
library(rgl)
set.seed(12345)
```
## Other 3D Viewing Options
Other options for viewing a 3D scene include:
* [Anaglyph 3D](http://en.wikipedia.org/wiki/Anaglyph_3D) using
red/cyan glasses.
* [Polarized 3D](http://en.wikipedia.org/wiki/Polarized_3D_system) as
currently used in many 3D movies.
* [Stereograms, stereoscopy](http://en.wikipedia.org/wiki/Stereoscopy)
A stereogram
* presents each eye with an image from a slightly different viewing angle;
* the brain fuses the images into a 3D view in a process called
[_stereopsis_](https://en.wikipedia.org/wiki/Stereopsis).
This approach is or was used
* with virtual reality displays like
[Oculus Rift](https://www.oculus.com/rift/);
* in the [View-Master toy](https://en.wikipedia.org/wiki/View-Master);
* in aerial photography using viewers like this:
![](https://upload.wikimedia.org/wikipedia/commons/3/31/Pocket_stereoscope.jpg)
You can create your own viewer with paper or cardboard tubes, for example.
A stereo image for the soil data surface:
```{r, message = FALSE}
tp <- trellis.par.get()
trellis.par.set(theme=col.whitebg())
ocol <- trellis.par.get("axis.line")$col
oclip <- trellis.par.get("clip")$panel
trellis.par.set(list(axis.line = list(col = "transparent"),
clip = list(panel = FALSE)))
print(wireframe(soi.fit ~ soi.grid$easting * soi.grid$northing,
cuts = 9,
aspect=diff(range(soi.grid$n))/diff(range(soi.grid$e)),
xlab = "Easting (km)",
ylab = "Northing (km)", shade=T,
screen = list(z = 40, x = -70, y = 3)),
split = c(1,1,2,1), more = TRUE)
print(wireframe(soi.fit ~ soi.grid$easting * soi.grid$northing,
cuts = 9,
aspect=diff(range(soi.grid$n))/diff(range(soi.grid$e)),
xlab = "Easting (km)",
ylab = "Northing (km)", shade=T,
screen = list(z = 40, x = -70, y = 0)),
split = c(2,1,2,1))
trellis.par.set(list(axis.line = list(col = ocol), clip = list(panel = oclip)))
trellis.par.set(tp)
```
## Some Calibration Experiments
### Simple Linear Stucture
Linear structure with normally distributed data:
```{r}
n <- 1000
d1 <- data.frame(x = rnorm(n), y = rnorm(n))
d1 <- mutate(d1, z = x + y)
```
```{r, eval = TRUE}
splom(d1)
xyplot(z ~ x | equal.count(y), data = d1)
xyplot(z ~ x | equal.count(y, 12), data = d1)
xyplot(z ~ x | equal.count(y, overlap = -0.8), data = d1)
splom(d1, group = abs(x) < 0.1)
```
```{r, webgl = TRUE}
clear3d()
points3d(d1)
```
Linear structure with uniformly distributed data:
```{r}
d2 <- data.frame(x = runif(n, -1, 1), y = runif(n, -1, 1))
d2 <- mutate(d2, z = x + y)
```
```{r, eval = TRUE}
splom(d2)
xyplot(z ~ x | equal.count(y, overlap = -0.8), data = d2)
splom(d2, group = abs(x) < 0.1)
```
```{r, webgl = TRUE}
clear3d()
points3d(d2)
```
### Nonlinearity and Noise
Normal data with non-linearity and noise:
```{r}
d3 <- mutate(d1, z = cos(x) + cos(y) + 0.5 * (x + y) + rnorm(x, 0, .2))
```
```{r, eval = TRUE}
splom(d3)
xyplot(z ~ x | equal.count(y, overlap = -0.8), data = d3)
splom(d3, group = abs(x) < 0.1)
splom(d3, group = abs(x + 1) < 0.1)
```
Uniform data with non-linearity and noise:
```{r}
d4 <- mutate(d2, z = cos(x) + cos(y) + 0.5 * (x + y) + rnorm(x, 0, .1))
```
```{r, eval = TRUE}
xyplot(z ~ x | equal.count(y, 12, overlap = -0.8), data = d4)
splom(d4, group = abs(x) < 0.1)
```
Full data in the background for context:
```{r, eval = TRUE}
xyplot(z ~ x | equal.count(y, 12, overlap = -0.8), data = d4,
panel = function(x, y, ...) {
panel.xyplot(d4$x, d4$z, col = "grey")
panel.xyplot(x, y,...)
})
```
```{r, webgl = TRUE}
clear3d()
points3d(d4)
```
### More Variables
Two-dimensional views, conditioning on two variables:
```{r}
n <- 10000
d <- data.frame(x1 = rnorm(n), x2 = rnorm(n), x3 = rnorm(n))
d <- mutate(d,
x4 = as.numeric(scale(x1 + cos(x2) + sin(x3) + rnorm(n, 0, 0.1))))
```
```{r, eval = TRUE}
splom(d)
splom(~d, data = d, group = abs(x1) < 0.1)
splom(~d, data = d, group = abs(x1) < 0.1 & abs(x2) < 0.1)
```
```{r, eval = TRUE, fig.height = 8}
xyplot(x4 ~ x3 |
equal.count(x1, 4, overlap = -0.8) +
equal.count(x2, 4, overlap = -0.8),
data = d, asp = 1)
```
Three-dimensional views, conditioning on one variable:
```{r}
dd <- filter(d, abs(x1) < 0.1)[2:4]
```
```{r, webgl = TRUE}
clear3d()
points3d(dd)
```
```{r, webgl = TRUE}
clear3d()
spheres3d(dd, radius = 0.05)
```
```{r, webgl = TRUE}
clear3d()
spheres3d(dd, radius = 0.05, col = ifelse(abs(dd$x2) < 0.1, "red", "blue"))
```
```{r, webgl = TRUE}
dd1 <- filter(d, abs(x1 - 1) < 0.1)[2:4]
dd2 <- filter(d, abs(x1 + 1) < 0.1)[2:4]
clear3d()
spheres3d(dd1, radius = 0.05, color = rep("red", ncol(dd1)))
spheres3d(dd2, radius = 0.05, color = rep("blue", ncol(dd2)))
```
```{r, eval = TRUE}
ddx <- mutate(d,
v = ifelse(abs(x1 - 1) < 0.1,
1,
ifelse(abs(x1 + 1) < 0.1, 2, NA)))
ddx <- filter(ddx, ! is.na(v))
cloud(x4 ~ x2 * x3, group = v, data = ddx, auto.key = TRUE)
cloud(x4 ~ x2 * x3 | v, data = ddx)
```
## Interactive Explorations
### Tk Sliders Using `tkrplot`
A simple animation function using the tkrplot` package:
```{r}
library(tkrplot)
tkranim <- function(fun, v, lo, hi, res = 0.1, title, label) {
tt <- tktoplevel()
if (! missing(title))
tkwm.title(tt, title)
draw <- function() fun(v)
img <-tkrplot(tt, draw)
tv <- tclVar(init = v)
f<-function(...) {
vv <- as.numeric(tclvalue(tv))
if (vv != v) {
v <<- vv
tkrreplot(img)
}
}
s <- tkscale(tt, command=f, from=lo, to=hi, variable=tv,
showvalue=FALSE, resolution=res, orient="horiz")
if (! missing(label))
tkpack(img, tklabel(tt, text=label), s)
else tkpack(img,s)
}
```
Interactive conditioning for one of the calibration examples:
```{r, eval = FALSE}
fd4 <- function(v)
print(splom(~ d4, group = abs(x - v) < 0.1, data = d4))
tkranim(fd4, 0, -1, 1, 0.05)
```
Interactive slicing for the soil resistivity surface:
```{r, eval = FALSE}
f <- function(i) {
e <- eastseq[i]
idx <- which(sf$easting == e)
r <- range(sf$fit)
print(xyplot(fit ~ northing, data = sf[idx,],
ylim = r, type = "l", main = sprintf("Easting = %f", e)))
}
tkranim(f, 1, 1, 85, 1)
```
Interactive conditioning for the soil resistivity data:
```{r, eval = FALSE}
binwidth <- 0.1 ## bad idea to use a global variable
g <- function(i) {
e <- eastseq[i]
idx <- which(abs(soil$easting - e) <= binwidth)
ry <- range(soil$resistivity)
rx <- range(soil$northing)
print(xyplot(resistivity ~ northing, data = soil[idx,],
xlim = rx, ylim = ry, main = sprintf("Easting = %f", e)))
}
tkranim(g, 1, 1, 85, 1)
```
### Shiny Sliders
A version of the animation function using the
[`shiny` package](http://shiny.rstudio.com/):
```{r}
shinyAnim <- function(fun, v, lo, hi, step = NULL, label = "Frame") {
require(shiny)
shinyApp(
ui = fluidPage(
sidebarLayout(
sidebarPanel(sliderInput("n", label, lo, hi, v, step,
animate = animationOptions(100))),
mainPanel(plotOutput("plot"))
)
),
server = function(input, output, session) {
session$onSessionEnded(function() shiny::stopApp())
output$plot <- renderPlot(fun(input$n))
}
)
}
```
```{r, eval = FALSE}
shinyAnim(fd4, 0, -1, 1, 0.05)
```
```{r, eval = FALSE}
shinyAnim(f, 1, 1, 85)
```
```{r, eval = FALSE}
shinyAnim(g, 1, 1, 85, 1)
```
### Animations as Movies
To show how a plot changes with a single parameter it is possible to
pre-compute the images and assemble them into a movie.
This is easy to do in Rmarkdown, but requires the `ffmpeg` software to
be installed.
Currently, `ffmpeg` is not available on the CLAS Linux systems.
A
[simple example](http://homepage.divms.uiowa.edu/~luke/examples/anim.html)
([Rmd](http://homepage.divms.uiowa.edu/~luke/examples/anim.Rmd)).
### Linking and Brushing
Linked plots and _brushing_ were pioneered at Bell Laboratories in the
early 1980s.
* Some videos are available in the ASA Statistics Computing and Statistical Graphics Sections' [video library](http://stat-graphics.org/movies/).
* Trellis displays were developed by the same research group
[_after_ the development of brushing](http://civilstat.com/2016/10/when-static-graphs-beat-interactives/).
The [`iplots`](http://www.iplots.org/) package provides a set of basic
plot that are all linked by default.
```{r, eval = FALSE}
library(iplots)
with(soil, iplot(northing, resistivity))
```
```{r, eval = FALSE}
with(soil, iplot(easting,northing))
```
The `rggobi` package provides another framework for linked plots and
brusing within R.
```{r, eval = FALSE}
library(rggobi)
ggobi(soil)
```
Several other tools with linking/brushing support and good integration
with R are under development.
* One of the furthest along seems to be
[`rbokeh`](https://hafen.github.io/rbokeh/).
* `plotly` also provides some support for linked brushing.
## Example: Ethanol Data
The `ethanol` data frame in the `lattice` package contains data from
an experiment efficiency and emissions in small one-cylinder
engines.
The data frame contains 88 observations on three variables:
* `NOx`: Concentration of nitrogen oxides (NO and NO2) in micrograms.
* `C` Compression ratio of the engine.
* `E` Equivalence ratio, a measure of the richness of the air and
ethanol fuel mixture.
A scatterplot matrix:
```{r}
library(lattice)
splom(ethanol)
```
Conditional plots of `NOx` against `E` for each level of `C`:
```{r, eval = TRUE}
xyplot(NOx ~ E | C, data = ethanol)
ggplot(ethanol, aes(x = E, y = NOx)) + geom_point() + facet_wrap(~C)
```
A single plot with color to separate levels of `C`:
```{r, eval = TRUE}
xyplot(NOx ~ E, group = C, data = ethanol, auto.key = TRUE)
ggplot(ethanol, aes(x = E, y = NOx, color = factor(C))) + geom_point()
```
Augmented with smooths:
```{r, eval = TRUE}
xyplot(NOx ~ E, group = C, type = c("p", "smooth"), data = ethanol,
auto.key = TRUE)
ggplot(ethanol, aes(x = E, y = NOx, color = factor(C))) +
geom_point() + geom_smooth(se = FALSE)
```
Conditional plots of `NOx` against `C` given levels of `E`:
```{r, eval = TRUE}
with(ethanol, {
Equivalence.Ratio <- equal.count(E, number = 9, overlap = 0.25)
xyplot(NOx ~ C | Equivalence.Ratio,
panel = function(x, y) {
panel.xyplot(x, y)
panel.loess(x, y, span = 1)
},
aspect = 2.5,
layout = c(5, 2),
xlab = "Compression Ratio",
ylab = "NOx (micrograms/J)")
})
```
An interactive 3D view:
```{r, webgl = TRUE}
library(rgl)
clear3d()
with(ethanol,points3d(E, C / 5, NOx / 5,
size = 4, col = rep("black", nrow(ethanol))))
```
## Example: Earth Quake Locations and Magnitudes
The `quakes` data frame contains data on locations of seismic events
of magnitude 4.0 or larger in a region near Fiji.
The time frame is from 1964 to perhaps 2000. More recent data is
available from a number of sources on the web.
Some scatter plot matrices:
```{r, fig.height = 7, fig.align = "center"}
library(lattice)
splom(quakes)
splom(quakes, cex = 0.1)
splom(quakes, alpha = 0.1)
splom(quakes, panel = panel.smoothScatter)
```
Where the quakes are located:
```{r, eval = TRUE, message = FALSE}
library(maps)
map("world2", c("Fiji", "Tonga", "New Zealand"))
with(quakes, points(long, lat, col = "red", cex = 0.1))
map("world2", c("Fiji", "Tonga", "New Zealand"))
with(quakes, points(long, lat, col = heat.colors(nrow(quakes))[rank(depth)],
cex = 0.1))
map("world2",c("Fiji", "Tonga", "New Zealand"))
with(quakes, points(long, lat, col = heat.colors(nrow(quakes))[rank(depth)]))
```
Showing `depth` with grouping:
```{r, eval = TRUE}
quakes2 <- mutate(quakes, depth_cut = cut_number(depth, 3))
xyplot(lat ~ long, aspect = "iso", groups = depth_cut,
data = quakes2, auto.key = list(columns = 3, title = "Depth"))
ggplot(quakes2, aes(x = long, y = lat, color = depth_cut) ) +
geom_point() + theme_bw() + coord_map()
ggplot(quakes2, aes(x = long, y = lat, colour = depth_cut, size = mag)) +
geom_point() + theme_bw() + coord_map()
```
Showing `depth` with a trellis display:
```{r, eval = TRUE}
xyplot(lat ~ long | depth_cut, aspect = "iso", data = quakes2)
xyplot(lat ~ long | depth_cut, aspect = "iso", type=c("p","g"),
data = quakes2, auto.key = TRUE)
ggplot(quakes2, aes(x = long, y = lat)) +
geom_point(color = "blue") + facet_wrap(~depth_cut, nrow = 1) +
theme_bw() + coord_map()
```
Adding the full data for background context:
```{r, eval = TRUE}
xyplot(lat ~ long | cut(depth, 3), aspect = "iso", type=c("p","g"),
panel = function(x, y, ...) {
panel.xyplot(quakes$long, quakes$lat, col = "grey")
panel.xyplot(x, y, ...)
},
data = quakes)
## quakes does not contain the depth_cut variable used in the facet
ggplot(quakes2, aes(x = long, y = lat)) +
geom_point(data = quakes, color = "gray") +
geom_point(color = "blue") + facet_wrap(~depth_cut, nrow = 1) +
theme_bw() + coord_map()
```
3D views of `lat`, `long`, and `depth`:
```{r, eval = TRUE}
cloud(-depth ~ long * lat, data = quakes)
```
```{r, webgl = TRUE}
library(rgl)
bg3d(col = "black")
clear3d()
with(quakes, points3d(long, lat, -depth / 50, size = 2,
col = rep("white", nrow(quakes))))
```
```{r, webgl = TRUE}
clear3d()
with(quakes, points3d(long, lat, -depth / 50, size = 2,
col=heat.colors(nrow(quakes))[rank(mag)]))
```
## Visualizing 2D and 3D Density Estimates
### 2D Density Estimates
We have already seen some examples of plotting 2D density contours on
a scatter plot using `geom_density2d`.
Here is a version that explicitly creates the density estimate:
```{r}
data(geyser, package = "MASS")
geyser <- mutate(geyser, duration = lag(duration))
geyser <- filter(geyser, ! is.na(duration))
plot(waiting ~ duration, data = geyser)
sc <- function(x) as.numeric(scale(x))
de <- with(geyser, MASS::kde2d(sc(duration), sc(waiting), n = 50, h = 0.5))
contour(de)
with(geyser, points(sc(duration), sc(waiting)))
```
The density estimate can also be viewed with a perspective plot or an
interactive 3D plot:
```{r}
persp(de)
```
```{r, webgl = TRUE}
clear3d()
bg3d(col = "white")
surface3d(unique(de$x), unique(de$y), de$z, col = "red")
box3d()
axes3d()
```
### 3D Density Estimates
A 3D normal mixture:
```{r}
library(rgl)
library(misc3d)
x <- matrix(rnorm(9000), ncol = 3)
x[1001 : 2000, 2] <- x[1001 : 2000, 2] + 4
x[2001 : 3000, 3] <- x[2001 : 3000, 3] + 4
```
```{r, webgl = TRUE}
clear3d()
bg3d(col = "white")
points3d(x = x[,1], y = x[,2], z = x[,3], size = 2, color = "black")
```
```{r}
g <- expand.grid(x = seq(-4, 8.5, len = 30),
y = seq(-4, 8.5, len = 30),
z = seq(-4, 8.5, len = 30))
```
Density estimates for two bandwidths:
```{r}
d <- kde3d(x[,1], x[,2], x[,3], 0.2, 30, c(-4, 8.5))$d
d2 <- kde3d(x[,1], x[,2], x[,3], 0.5, 30, c(-4, 8.5))$d
```
3D plot of data and contour for bw = 0.2:
```{r, webgl = TRUE}
clear3d()
xv <- seq(-4, 8.5, len = 30)
points3d(x = x[,1], y = x[,2], z = x[,3], size = 2, color = "black")
contour3d(d, 20 / 3000, xv, xv, xv, color="red", alpha = 0.5, add = TRUE)
```
3D plot of data and a contour for bw = 0.5:
```{r, webgl = TRUE}
clear3d()
points3d(x = x[,1], y = x[,2], z = x[,3], size = 2, color = "black")
contour3d(d2, 20 / 3000, xv, xv, xv, color="red", alpha = 0.5, add = TRUE)
```
3D plot of several contours for bw = 0.5:
```{r, webgl = TRUE}
clear3d()
contour3d(d2, c(10, 25, 40) / 3000, xv, xv, xv,
color=c("red", "green", "blue"), alpha=c(0.2, 0.5, 0.7), add = TRUE)
```
Density contours for the `quakes` data:
```{r, webgl = TRUE}
library(misc3d)
library(rgl)
clear3d()
bg3d(col="white")
points3d(quakes$long / 22, quakes$lat / 28, -quakes$depth / 640, size=2,
col = rep("black", nrow(quakes)))
box3d(col="gray")
```
```{r, webgl = TRUE}
d <- kde3d(quakes$long,quakes$lat,-quakes$depth, n = 40)
contour3d(d$d, level = exp(-12), x = d$x / 22, y = d$y /28, z = d$z / 640,
color="green",color2="gray", scale=FALSE,add=TRUE)
```
Re-scaling depth to (approximately) match surface location scale:
```{r, webgl = TRUE}
clear3d()
points3d(quakes$long, quakes$lat, -quakes$depth / 110, size=2,
col = rep("black", nrow(quakes)))
box3d(col="gray")
```
```{r, webgl = TRUE}
d <- kde3d(quakes$long,quakes$lat,-quakes$depth, n = 40)
contour3d(d$d, level = exp(-12), x = d$x, y = d$y, z = d$z / 110,
color="green",color2="gray", scale=FALSE,add=TRUE)
```
Variant with alpha blending and two contour surfaces:
```{r, webgl = TRUE}
clear3d()
points3d(quakes$long, quakes$lat, -quakes$depth/110, size = 2)
box3d(col="gray")
d <- kde3d(quakes$long,quakes$lat,-quakes$depth, n = 40)
contour3d(d$d, level = exp(c(-10, -12)), x = d$x, y = d$y, z = d$z / 110,
color=c("red", "green"), alpha = 0.1, scale=FALSE,add=TRUE)
```
## Example: Environmental Data
Daily measurements of ozone concentration, wind speed, temperature and
solar radiation in New York City from May to September of 1973.
This analysis is from Cleveland's _Visualizing Data_ book.
A goal is to see to what degree ozone variation can be explained by
radiation, temperature, and wind variation.
```{r}
library(lattice)
splom(environmental, pscale = 0)
histogram(environmental$ozone)
histogram(environmental$ozone^(1/3))
## (Cleveland uses cube root)
env <- mutate(environmental, ozone = ozone ^ (1 / 3))
splom(env)
```
Ozone seems to increase with temperature and decrease with wind.
Some conditional plots:
```{r, eval = TRUE, fig.height = 8}
xyplot(ozone ~ radiation | equal.count(temperature, 4) *
equal.count(wind, 4),
data = env, type = c("p", "g", "smooth"), asp = 1.5, col.line = "red")
```
```{r, eval = TRUE}
xyplot(ozone ~ temperature | equal.count(radiation, 3, overlap = 0) *
equal.count(wind, 2, overlap = 0.2),
data = env, type = c("p", "g", "smooth"))
```
```{r, eval = TRUE}
xyplot(ozone ~ temperature | equal.count(radiation, 4, overlap = 0),
group = ifelse(wind >= 10, "high", "low"),
data = env, auto.key=TRUE, type = c("p", "g", "smooth"))
```
3D scatter plot with wind split in two levels, `red` for low wind,
`blue` for high wind:
```{r}
ne <- names(env)
wcut <- with(env, cut_number(wind, 2))
idx <- as.integer(wcut)
cols <- c("red", "blue")[idx]
```
```{r, webgl = TRUE}
library(rgl)
clear3d()
points3d(scale(env[-4]), radius = 0.05, col = cols)
axes3d()
title3d(xlab = ne[1], ylab = ne[2], zlab = ne[3])
```
Fitting a `loess` surface:
```{r}
fit.env <- loess(ozone ~ wind * temperature * radiation,
span = 1.0, degree = 2, data = env)
scaleBy <- function(x, y) (x - mean(y)) / sd(y)
```
Visualising the surface:
```{r}
tmesh <- with(env, do.breaks(range(temperature), 50))
rmesh <- with(env, do.breaks(range(radiation), 50))
wmesh <- with(env, do.breaks(range(wind), 50))
grid <- expand.grid(temperature = tmesh, radiation = rmesh, wind = wmesh)
grid$fit <- as.numeric(predict(fit.env, newdata = grid))
gridw <- expand.grid(temperature = tmesh, radiation = rmesh, wind = 8)
fit.pred1 <- predict(fit.env, newdata = mutate(gridw, wind = 8))
fit.pred2 <- predict(fit.env, newdata = mutate(gridw, wind = 11))
gridw$fit1 <- as.numeric(fit.pred1)
gridw$fit2 <- as.numeric(fit.pred2)
```
```{r, webgl = TRUE}
library(rgl)
clear3d()
with(env,
surface3d(scaleBy(unique(gridw$temperature), temperature),
scaleBy(unique(gridw$radiation), radiation),
scaleBy(gridw$fit1, ozone), col = "red"))
with(env,
surface3d(scaleBy(unique(gridw$temperature), temperature),
scaleBy(unique(gridw$radiation), radiation),
scaleBy(gridw$fit2, ozone), col = "blue"))
axes3d()
box3d()
title3d(xlab = "temp", ylab = "rad", zlab = "ozone")
```
Conditional plots of the fit surface:
```{r, eval = TRUE, fig.height = 8}
g <- with(env,
expand.grid(radiation = do.breaks(range(radiation), 50),
temperature = do.breaks(range(temperature), 3),
wind = do.breaks(range(wind), 3)))
g$fit <- as.numeric(predict(fit.env, newdata = g))
xyplot(fit ~ radiation| temperature * wind, data = g,
type = c("g", "l"), asp = 1.5)
```
Cleveland points out that there is no data for high temperature/high
wind or low temperature/low wind, to estimates in those regions are
unreliable:
```{r, eval = TRUE}
xyplot(wind ~ temperature, data = env)
```
Setting the corresponding fit values to `NA` emphasizes the more
reliable results:
```{r, eval = TRUE, fig.height = 8}
## Cleveland Fig 5.9
gg <- g
gg$fit[gg$temperature > 80 & gg$wind > 20] <- NA
gg$fit[gg$temperature < 80 & gg$wind < 5] <- NA
xyplot(fit ~ radiation| temperature * wind, data = gg,
type = c("g", "l"), asp = 1.5)
```
Residuals plots:
```{r, eval = TRUE}
env$fit <- predict(fit.env)
env$res <- env$ozone - env$fit
xyplot(res ~ fit, data = env)
```
Spread-location plots:
```{r, eval = TRUE}
xyplot(sqrt(abs(res)) ~ fit, data = env)
```
Residual and fit spread (rfs) plots
```{r, eval = TRUE}
rfs(fit.env)
```
## Medical Imaging Data
Medical imaging technologies, such as
[magnetic resonance imaging (MRI)](https://en.wikipedia.org/wiki/Magnetic_resonance_imaging)
produce a three-dimensional array of _intensity values_, one for each
_volume element_, or _voxel_.
A common visualization allows the viewer to interactively examine
image plots of the intensities for different slices through the
volume.
```{r, eval = FALSE}
readVol <- function(fname, dim = c(181, 217, 181)) {
f<-gzfile(fname, open="rb")
on.exit(close(f))
b<-readBin(f,"integer",prod(dim),size = 1, signed = FALSE)
array(b, dim)
}
imgfile <-
"/group/statsoft/data/Brainweb/images/T1/t1_icbm_normal_1mm_pn3_rf20.rawb.gz"
v <- readVol(imgfile)
library(misc3d)
slices3d(v)
```
The interactive heatmap can also be used for functions of three
variables, such as a density estimate:
```{r, eval = FALSE}
slices3d(d2)
```
## Notes
### What to Look for in a Scatter Plot
The easiest features to see in a acatterplot:
- outliers, clusters (zero-dimensional objects)
- curves (one-dimensional objects)
Changes in point density are also perceivable, but much less easily.
### Conditioning and Projection
Scatter plots show a two-dimensional projection of the data.
- Projections preserve dimension of zero and one dimensional objects.
- Two and higher dimensional objects are collapsed to two dimensions.
Zero and one dimensional objects can be occluded in a projection by
higher-dimensional objects.
Conditioning can reduce dimensionality:
- conditioning on one variable reduces dimensionality by one;
- conditioning on two variables reduces dimensionality by two.
Conditioning on more than two variables is rarely practical.
Many different projections are possible. Some useful tools for exploring:
- 3D visualizations;
- scatter plot matrices;
- principal components analysis;
- grand and guided tours;
- scagnostics.
### 3D Visualizations
- 3D visualizations take advantage of our visual system's ability to
create a 3D mental image from a 2D rendering.
- Several techniques are used to help our visual system:
- motion;
- depth cueing;
- perspective projection.
- For viewing data it is often helpful to turn off the perspective
adjustment.
- Most systems for 3D viewing provide a way to do this.
An illustration is provided by the `randu` data set of 400 triples of
successive random numbers from the
[RANDU](https://en.wikipedia.org/wiki/RANDU) pseudo-random number
generator. The flaw is much easier to see then the perspective
adjustment is removed.
```{r, webgl = TRUE}
library(rgl)
clear3d()
points3d(randu, size = 5)
```
```{r, webgl = TRUE}
par3d(FOV=1) ## removes perspective distortion
```
## Two Classification Problems
Two data sets where the objective is to identify class membership from
measurements
- Australian Crabs
- Olive Oils
### Australian Crabs
The data frame `crabs` in the `MASS` package contains measurement on crabs
of a species that has been split into two based on color, orange or blue.
Preserved specimens lose their color (and possibly ability to identify
sex).
Data were collected to help classify preserved specimens. It would be
useful to have a fairly simple classification rule.
An initial view:
```{r, fig.height = 7, fig.align = "center"}
data(crabs, package = "MASS")
splom(~ crabs[4:8], group = paste(sex, sp),
data = crabs, auto.key = TRUE, pscale = 0)
```
- The variables are highly correlated, reflecting overall size and age.
- The `RW * CL` or `RW * CW` plots separate males and females well, at
least for larger crabs.
Some possible next steps:
- Consider a `log` transformation if the marginal distributions are skewed.
- Reduce the correlation by a linear or nonlinear transformation.
A quick look at histograms:
```{r}
library(tidyr)
library(ggplot2)
ggallhists <- function(data, nb = nclass.Sturges(data[[1]])) {
ggplot(gather(data, var, val), aes(val)) +
geom_histogram(bins = nb) + xlab("") + ylab("") +
facet_wrap(~ var, scales = "free")
}
ggallhists(crabs[4:8])
```
- It does not look like a `log` transformation is needed.
Principal component analysis identifies a rotation of the data that
produces uncorrelated _scores_.
```{r, fig.height = 7, fig.align = "center"}
pc <- princomp(crabs[4:8])
splom(~ pc$scores, group = paste(sex, sp),
data = crabs, auto.key = TRUE, pscale = 0)
```
- There is some separation of the classes in the plot of the second
and third components.
- A nonlinear transformation that scales relative to one of the
variables, say `CL`, may do better.
```{r, fig.height = 7, fig.align = "center"}
cr <- mutate(crabs, FLCL = FL/CL, RWCL = RW/CL, CWCL = CW/CL, BDCL = BD/CL)
splom(~ cr[9:12], group = paste(sex, sp),
data = cr, auto.key = TRUE, pscale = 0)
```
- The `CWCL * BDCL` plot shows good species separation by a line.
- Using `CWCL - BDCL` will pick out the axis of separation.
```{r, fig.height = 7, fig.align = "center"}
splom(~ data.frame(FLCL, RWCL, CWCL - BDCL), group = paste(sex, sp),
data = cr, auto.key = TRUE, pscale = 0)
```
- The `FLCL * (CWCL - BDCL)` plot shows perfect separation.
- After another rotation to `FLCL - (CWCL - BDCL) = FLCL + BDCL - CWCL`
a simple criterion emerges.
```{r}
stripplot(paste(sex, sp) ~ FLCL + BDCL - CWCL, data = cr,
jitter = TRUE, group = paste(sex, sp),
panel = function(...) {
panel.stripplot(...)
panel.abline(v = -0.23, lty = 2)
})
```
- The sexes are well separated by `RWCL` for larger crabs; less so for
smaller ones
```{r}
xyplot(RWCL ~ CL | sp, data = cr, group = paste(sex), auto.key=TRUE)
```
- Dividing the crabs at `CL` of 30 into small and large gives a simple
rule that works fairly well across species and sizes.
```{r}
xyplot(RWCL ~ CL | sp, data = cr, group = paste(sex), auto.key=TRUE,
panel = function(...) {
panel.xyplot(...)
## panel.segments(c(0, 30), c(0.42, 0.39), c(30, 60), c(0.42, 0.39),
## lty = 2)
ds <- 0.42
dl <- 0.39
panel.lines(c(0, 30, 30, 60), c(ds, ds, dl, dl),
lty = 2, col = "black")
})
```
- Using principle component analysis on the scaled data produces a
similar result based on the first two principal components.
```{r, fig.height = 7, fig.align = "center"}
pc <- princomp(cr[9:12])
splom(~ pc$scores, group = paste(sex, sp),
data = cr, auto.key = TRUE, pscale = 0)
```
- When designing a classifier it is usually a good idea to split the
data into a _training set_ and a _test set.
- After building the classifier with the training data its performance
can be assessed using the test data.
```{r, include = FALSE}
## This reproduces what biplot() does and could be used to make a
## ggplot version. The ggbiplot function in ggfortify does something
## different with the loadings.
bp <- function (pc) {
choices <- 1:2
n <- pc$n.obs
scores <- pc$scores[, choices]
loadings <- pc$loadings[, choices]
lam <- pc$sdev[choices]
lam <- lam * sqrt(n)
scores <- t(t(scores) / lam)
vars <- t(t(loadings) * lam)
plot(scores)
vars <- vars * max(abs(scores)) / max(abs(vars)) * 0.8
arrows(0, 0, vars[,1], vars[, 2], col = "red")
text(vars, rownames(loadings), col = "red")
invisible()
}
```
### Olive Oils
The `olives` data frame in the `extracat` package contains
measurements on the proportion of 8 fatty acids (% times 100) in olive
oil samples.
- Samples come from 9 areas grouped in three regions.
![](img/olive-oils-map.png)
- A goal is to see if region and perhaps area can be determined from
these measurements.
- This might help in discovering cheap varieties masquerading as
expensive ones.
- A useful approach is to first try to identify the regions and then
the areas within regions.
A first look at the data:
```{r, fig.height = 8, fig.align = "center"}
library(lattice)
data(olives, package = "extracat")
splom(~olives[3:10], cex = 0.1, group = Region,
data = olives, auto.key = TRUE, pscale = 0)
```
- Oils from the `South` region have a much higher `eicosenoic` content
than oils from the other regions.
```{r}
stripplot(Region ~ eicosenoic, data = olives, type = c("p", "g"),
jitter = TRUE, group = Region)
```
- To separate `North` from `Sardinia` we can remove the `South` samples.
```{r, fig.height = 8, fig.align = "center"}
ons <- filter(olives, Region != "South")
ons <- droplevels(ons)
splom(~ons[3:10], cex = 0.1, group = Region, data = ons,
auto.key = TRUE, pscale = 0)
```
- The `arachidic * linoleic` plot allows these two regions to be
separated:
```{r}
xyplot(arachidic ~ linoleic, group = Region, data = ons,
type = c("p", "g"), auto.key = TRUE)
```
- Among the non-`South` oils the ones from `Sardinia` can be
identified as the ones with `linoleic > 1000` and `arachidic > 40`.
Looking at the oils for the two areas within Sardinia:
```{r, fig.height = 8, fig.align = "center"}
sar <- filter(olives, Region == "Sardinia")
sar <- droplevels(sar)
splom(~sar[3:10], cex = 0.1, group = Area,
data = sar, auto.key = TRUE, pscale = 0)
```
- Within `Sardinia` the `oleic * linoleic` plot suggests looking at
`oleic - linoleic`
```{r}
stripplot(Area ~ oleic - linoleic, data = sar, jitter = TRUE,
type = c("p", "g"))
```
The inland oils are identified by having `oleic - linoleic > 6000`.
## Grand and Guided Tours
An alternative approach is to use `rggobi` to explore the data with
_grand tours_ and _guided tours_.
- The _grand tour_ is a framework for animating smoothly through a
sequence of randomly chosen projections.
- A _guided tour_ encourages the tour to move towards projections that
are more interesting according to one of several criteria.
- The `rggobi` approach is documented in
> Cook, Dianne and Swayne, Deborah F. (2007), _Interactive and Dynamic
> Graphics for Data Analysis_ with R and GGobi_, Springer Verlag.
To explore this approach with the scaled `crabs` data:
```{r, eval = FALSE}
library(rggobi)
ggobi(cr)
```
## Scagnostics
- With a large number of variables it isn't realistic to view all
possible pairwise scatter plots.
- This becomes even more of an issue with automated projection systems
like the grand tour.
- _Scagnostics_, or scatter plot diagnostics, are numerical summaries
of interesting features in a scatter plot.
- The projection pursuit indices used in the guided tour are a form of
scagnostic.
- One possible use is to automatically compute sacgnostics for all
possible pairwise scatter plots and view the ones with high scores.
- A [German Wikipedia article](https://de.wikipedia.org/wiki/Scagnostics)
has some graphics that illustrate the proposed diagnostics.
![](https://upload.wikimedia.org/wikipedia/commons/thumb/c/c7/ScagnosticsConvex.svg/1024px-ScagnosticsConvex.svg.png)
![](https://upload.wikimedia.org/wikipedia/commons/thumb/0/02/ScagnosticsClumpy.svg/1024px-ScagnosticsClumpy.svg.png)
- Some references:
> Wilkinson, Leland, Anushka Anand, and Robert
> L. Grossman. "Graph-theoretic scagnostics." INFOVIS. Vol. 5. 2005.
> Dang, Tuan Nhon, and Leland Wilkinson. "Scagexplorer: Exploring
> scatterplots by their scagnostics." Visualization Symposium
> (PacificVis), 2014 IEEE Pacific. IEEE, 2014.
- An R package implementing these ideas is available.
```{r, fig.height = 8, fig.align = "center"}
library(scagnostics)
s <- scagnostics(olives[3:10])
heatmap(s)
colnames(s)[max.col(s)]
splom(t(unclass(s)), varname.cex = 0.7, pscale = 0)
```
## Two Sliders Using Shiny
This function creates a `shiny` animation using two sliders:
```{r, eval = FALSE}
## two-parameter shiny
## performance is not good
shinyAnim2 <- function(fun, v, lo, hi, step = NULL, label = "Frame", fps = 4) {
require(shiny)
msecs <- 1000 / fps
shinyApp(
ui = fluidPage(
sidebarLayout(
sidebarPanel(
sliderInput("v1", label[1], lo[1], hi[1], v[1], step[1],
animate = animationOptions(msecs)),
sliderInput("v2", label[2], lo[2], hi[2], v[2], step[2],
animate = animationOptions(msecs))),
mainPanel(plotOutput("plot"))
)
),
server = function(input, output, session) {
session$onSessionEnded(function() shiny::stopApp())
output$plot <- renderPlot(fun(input$v1, input$v2))
})
}
```
Some calibration data:
```{r, eval = FALSE}
library(tidyverse)
library(shiny)
library(lattice)
library(ggplot2)
n <- 10000
d <- data.frame(x1 = runif(n, -1, 1), x2 = runif(n, -1, 1), x3 = rnorm(n))
d <- mutate(d,
x4 = as.numeric(scale(x1 + cos(x2) + sin(x3) + rnorm(n, 0, 0.1))))
```
Animation using `splom`:
```{r, eval = FALSE}
fd <- function(v1, v2)
splom(~d, data = d, group = abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1)
shinyAnim2(fd, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05))
```
Using `xyplot`:
```{r, eval = FALSE}
fd2 <- function(v1, v2)
xyplot(x4 ~ x3, group = abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1,
data = d, pch=19)
shinyAnim2(fd2, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05))
```
Using `ggplot`:
```{r, eval = FALSE}
fd3 <- function(v1, v2)
ggplot(d, aes(x = x3, y = x4)) + geom_point(color = "grey") +
geom_point(data = filter(d, abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1),
color = "red")
shinyAnim2(fd3, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05))
```
Rendering of `ggplot` and `lattice` graphics is slow and not ideal for
animations; using base graphics helps a little:
```{r, eval = FALSE}
fd4 <- function(v1, v2) {
plot(x4 ~ x3, data = d, pch = 19, col = "blue", asp = 1)
with(filter(d, abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1),
points(x3, x4, col = "red", pch = 19))
}
shinyAnim2(fd4, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05))
```