Other options for viewing a 3D scene include:
A stereogram
This approach is or was used
You can create your own viewer with paper or cardboard tubes, for example.
A stereo image for the soil data surface:
tp <- trellis.par.get()
trellis.par.set(theme = col.whitebg())
ocol <- trellis.par.get("axis.line")$col
oclip <- trellis.par.get("clip")$panel
trellis.par.set(list(axis.line = list(col = "transparent"),
clip = list(panel = FALSE)))
print(wireframe(soi.fit ~ soi.grid$easting * soi.grid$northing,
cuts = 9,
aspect = diff(range(soi.grid$n)) / diff(range(soi.grid$e)),
xlab = "Easting (km)",
ylab = "Northing (km)", shade = TRUE,
screen = list(z = 40, x = -70, y = 3)),
split = c(1, 1, 2, 1), more = TRUE)
print(wireframe(soi.fit ~ soi.grid$easting * soi.grid$northing,
cuts = 9,
aspect = diff(range(soi.grid$n)) / diff(range(soi.grid$e)),
xlab = "Easting (km)",
ylab = "Northing (km)", shade = TRUE,
screen = list(z = 40, x = -70, y = 0)),
split = c(2, 1, 2, 1))
trellis.par.set(list(axis.line = list(col = ocol), clip = list(panel = oclip)))
trellis.par.set(tp)
Linear structure with normally distributed data:
n <- 1000
d1 <- data.frame(x = rnorm(n), y = rnorm(n))
d1 <- mutate(d1, z = x + y)
splom(d1)
xyplot(z ~ x | equal.count(y), data = d1)
xyplot(z ~ x | equal.count(y, 12), data = d1)
xyplot(z ~ x | equal.count(y, overlap = -0.8), data = d1)
splom(d1, group = abs(x) < 0.1)
clear3d()
points3d(d1)
Linear structure with uniformly distributed data:
d2 <- data.frame(x = runif(n, -1, 1), y = runif(n, -1, 1))
d2 <- mutate(d2, z = x + y)
splom(d2)
xyplot(z ~ x | equal.count(y, overlap = -0.8), data = d2)
splom(d2, group = abs(x) < 0.1)
clear3d()
points3d(d2)
Normal data with non-linearity and noise:
d3 <- mutate(d1, z = cos(x) + cos(y) + 0.5 * (x + y) + rnorm(x, 0, .2))
splom(d3)
xyplot(z ~ x | equal.count(y, overlap = -0.8), data = d3)
splom(d3, group = abs(x) < 0.1)
splom(d3, group = abs(x + 1) < 0.1)
Uniform data with non-linearity and noise:
d4 <- mutate(d2, z = cos(x) + cos(y) + 0.5 * (x + y) + rnorm(x, 0, .1))
xyplot(z ~ x | equal.count(y, 12, overlap = -0.8), data = d4)
splom(d4, group = abs(x) < 0.1)
Full data in the background for context:
xyplot(z ~ x | equal.count(y, 12, overlap = -0.8), data = d4,
panel = function(x, y, ...) {
panel.xyplot(d4$x, d4$z, col = "grey")
panel.xyplot(x, y, ...)
})
clear3d()
points3d(d4)
Two-dimensional views, conditioning on two variables:
n <- 10000
d <- data.frame(x1 = rnorm(n), x2 = rnorm(n), x3 = rnorm(n))
d <- mutate(d,
x4 = as.numeric(scale(x1 + cos(x2) + sin(x3) + rnorm(n, 0, 0.1))))
splom(d)
splom(~ d, data = d, group = abs(x1) < 0.1)
splom(~ d, data = d, group = abs(x1) < 0.1 & abs(x2) < 0.1)
xyplot(x4 ~ x3 |
equal.count(x1, 4, overlap = -0.8) +
equal.count(x2, 4, overlap = -0.8),
data = d, asp = 1)
Three-dimensional views, conditioning on one variable:
dd <- filter(d, abs(x1) < 0.1)[2 : 4]
clear3d()
points3d(dd)
clear3d()
spheres3d(dd, radius = 0.05)
clear3d()
spheres3d(dd, radius = 0.05, col = ifelse(abs(dd$x2) < 0.1, "red", "blue"))
dd1 <- filter(d, abs(x1 - 1) < 0.1)[2 : 4]
dd2 <- filter(d, abs(x1 + 1) < 0.1)[2 : 4]
clear3d()
spheres3d(dd1, radius = 0.05, color = rep("red", ncol(dd1)))
spheres3d(dd2, radius = 0.05, color = rep("blue", ncol(dd2)))
ddx <- mutate(d,
v = ifelse(abs(x1 - 1) < 0.1,
1,
ifelse(abs(x1 + 1) < 0.1, 2, NA)))
ddx <- filter(ddx, ! is.na(v))
cloud(x4 ~ x2 * x3, group = v, data = ddx, auto.key = TRUE)
cloud(x4 ~ x2 * x3 | v, data = ddx)
tkrplot
A simple animation function using the tkrplot` package:
library(tkrplot)
## Loading required package: tcltk
tkranim <- function(fun, v, lo, hi, res = 0.1, title, label) {
tt <- tktoplevel()
if (! missing(title))
tkwm.title(tt, title)
draw <- function() fun(v)
img <- tkrplot(tt, draw)
tv <- tclVar(init = v)
f <- function(...) {
vv <- as.numeric(tclvalue(tv))
if (vv != v) {
v <<- vv
tkrreplot(img)
}
}
s <- tkscale(tt, command = f, from = lo, to = hi, variable = tv,
showvalue = FALSE, resolution = res, orient = "horiz")
if (! missing(label))
tkpack(img, tklabel(tt, text = label), s)
else tkpack(img, s)
}
Interactive conditioning for one of the calibration examples:
fd4 <- function(v)
print(splom(~ d4, group = abs(x - v) < 0.1, data = d4))
tkranim(fd4, 0, -1, 1, 0.05)
Interactive slicing for the soil resistivity surface:
f <- function(i) {
e <- eastseq[i]
idx <- which(sf$easting == e)
r <- range(sf$fit)
print(xyplot(fit ~ northing, data = sf[idx, ],
ylim = r, type = "l", main = sprintf("Easting = %f", e)))
}
tkranim(f, 1, 1, 85, 1)
Interactive conditioning for the soil resistivity data:
binwidth <- 0.1 ## bad idea to use a global variable
g <- function(i) {
e <- eastseq[i]
idx <- which(abs(soil$easting - e) <= binwidth)
ry <- range(soil$resistivity)
rx <- range(soil$northing)
print(xyplot(resistivity ~ northing, data = soil[idx, ],
xlim = rx, ylim = ry, main = sprintf("Easting = %f", e)))
}
tkranim(g, 1, 1, 85, 1)
A version of the animation function using the shiny
package:
shinyAnim <- function(fun, v, lo, hi, step = NULL, label = "Frame") {
require(shiny)
shinyApp(
ui = fluidPage(
sidebarLayout(
sidebarPanel(sliderInput("n", label, lo, hi, v, step,
animate = animationOptions(100))),
mainPanel(plotOutput("plot"))
)
),
server = function(input, output, session) {
session$onSessionEnded(function() shiny::stopApp())
output$plot <- renderPlot(fun(input$n))
}
)
}
shinyAnim(fd4, 0, -1, 1, 0.05)
shinyAnim(f, 1, 1, 85)
shinyAnim(g, 1, 1, 85, 1)
To show how a plot changes with a single parameter it is possible to pre-compute the images and assemble them into a movie.
This is easy to do in Rmarkdown, but requires the ffmpeg
software to be installed.
Currently, ffmpeg
is not available on the CLAS Linux systems.
A simple example (Rmd).
Linked plots and brushing were pioneered at Bell Laboratories in the early 1980s.
Some videos are available in the ASA Statistics Computing and Statistical Graphics Sections’ video library.
Trellis displays were developed by the same research group after the development of brushing.
The iplots
package provides a set of basic plot that are all linked by default.
library(iplots)
with(soil, iplot(northing, resistivity))
with(soil, iplot(easting, northing))
The rggobi
package provides another framework for linked plots and brusing within R.
library(rggobi)
ggobi(soil)
Several other tools with linking/brushing support and good integration with R are under development.
rbokeh
.plotly
also provides some support for linked brushing.The ethanol
data frame in the lattice
package contains data from an experiment efficiency and emissions in small one-cylinder engines.
The data frame contains 88 observations on three variables:
NOx
: Concentration of nitrogen oxides (NO and NO2) in micrograms.
C
Compression ratio of the engine.
E
Equivalence ratio, a measure of the richness of the air and ethanol fuel mixture.
A scatterplot matrix:
library(lattice)
splom(ethanol)
Conditional plots of NOx
against E
for each level of C
:
xyplot(NOx ~ E | C, data = ethanol)
ggplot(ethanol, aes(x = E, y = NOx)) + geom_point() + facet_wrap(~ C)
A single plot with color to separate levels of C
:
xyplot(NOx ~ E, group = C, data = ethanol, auto.key = TRUE)
ggplot(ethanol, aes(x = E, y = NOx, color = factor(C))) + geom_point()
Augmented with smooths:
xyplot(NOx ~ E, group = C, type = c("p", "smooth"), data = ethanol,
auto.key = TRUE)
ggplot(ethanol, aes(x = E, y = NOx, color = factor(C))) +
geom_point() + geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Conditional plots of NOx
against C
given levels of E
:
with(ethanol, {
Equivalence.Ratio <- equal.count(E, number = 9, overlap = 0.25)
xyplot(NOx ~ C | Equivalence.Ratio,
panel = function(x, y) {
panel.xyplot(x, y)
panel.loess(x, y, span = 1)
},
aspect = 2.5,
layout = c(5, 2),
xlab = "Compression Ratio",
ylab = "NOx (micrograms/J)")
})
An interactive 3D view:
library(rgl)
clear3d()
with(ethanol, points3d(E, C / 5, NOx / 5,
size = 4, col = rep("black", nrow(ethanol))))
The quakes
data frame contains data on locations of seismic events of magnitude 4.0 or larger in a region near Fiji.
The time frame is from 1964 to perhaps 2000. More recent data is available from a number of sources on the web.
Some scatter plot matrices:
library(lattice)
splom(quakes)
splom(quakes, cex = 0.1)
splom(quakes, alpha = 0.1)
splom(quakes, panel = panel.smoothScatter)
## (loaded the KernSmooth namespace)
Where the quakes are located:
library(maps)
map("world2", c("Fiji", "Tonga", "New Zealand"))
with(quakes, points(long, lat, col = "red", cex = 0.1))
map("world2", c("Fiji", "Tonga", "New Zealand"))
with(quakes, points(long, lat, col = heat.colors(nrow(quakes))[rank(depth)],
cex = 0.1))
map("world2", c("Fiji", "Tonga", "New Zealand"))
with(quakes, points(long, lat, col = heat.colors(nrow(quakes))[rank(depth)]))
Showing depth
with grouping:
quakes2 <- mutate(quakes, depth_cut = cut_number(depth, 3))
xyplot(lat ~ long, aspect = "iso", groups = depth_cut,
data = quakes2, auto.key = list(columns = 3, title = "Depth"))
ggplot(quakes2, aes(x = long, y = lat, color = depth_cut)) +
geom_point() + theme_bw() + coord_map()
ggplot(quakes2, aes(x = long, y = lat, colour = depth_cut, size = mag)) +
geom_point() + theme_bw() + coord_map()
Showing depth
with a trellis display:
xyplot(lat ~ long | depth_cut, aspect = "iso", data = quakes2)
xyplot(lat ~ long | depth_cut, aspect = "iso", type = c("p", "g"),
data = quakes2, auto.key = TRUE)
ggplot(quakes2, aes(x = long, y = lat)) +
geom_point(color = "blue") + facet_wrap(~ depth_cut, nrow = 1) +
theme_bw() + coord_map()
Adding the full data for background context:
xyplot(lat ~ long | cut(depth, 3), aspect = "iso", type = c("p", "g"),
panel = function(x, y, ...) {
panel.xyplot(quakes$long, quakes$lat, col = "grey")
panel.xyplot(x, y, ...)
},
data = quakes)
## quakes does not contain the depth_cut variable used in the facet
ggplot(quakes2, aes(x = long, y = lat)) +
geom_point(data = quakes, color = "gray") +
geom_point(color = "blue") + facet_wrap(~ depth_cut, nrow = 1) +
theme_bw() + coord_map()
3D views of lat
, long
, and depth
:
cloud(-depth ~ long * lat, data = quakes)
library(rgl)
bg3d(col = "black")
clear3d()
with(quakes, points3d(long, lat, -depth / 50, size = 2,
col = rep("white", nrow(quakes))))
clear3d()
with(quakes, points3d(long, lat, -depth / 50, size = 2,
col = heat.colors(nrow(quakes))[rank(mag)]))
We have already seen some examples of plotting 2D density contours on a scatter plot using geom_density2d
.
Here is a version that explicitly creates the density estimate:
data(geyser, package = "MASS")
geyser <- mutate(geyser, duration = lag(duration))
geyser <- filter(geyser, ! is.na(duration))
plot(waiting ~ duration, data = geyser)
sc <- function(x) as.numeric(scale(x))
de <- with(geyser, MASS::kde2d(sc(duration), sc(waiting), n = 50, h = 0.5))
contour(de)
with(geyser, points(sc(duration), sc(waiting)))
The density estimate can also be viewed with a perspective plot or an interactive 3D plot:
persp(de)
clear3d()
bg3d(col = "white")
surface3d(unique(de$x), unique(de$y), de$z, col = "red")
box3d()
axes3d()
A 3D normal mixture:
library(rgl)
library(misc3d)
x <- matrix(rnorm(9000), ncol = 3)
x[1001 : 2000, 2] <- x[1001 : 2000, 2] + 4
x[2001 : 3000, 3] <- x[2001 : 3000, 3] + 4
clear3d()
bg3d(col = "white")
points3d(x = x[, 1], y = x[, 2], z = x[, 3], size = 2, color = "black")
g <- expand.grid(x = seq(-4, 8.5, len = 30),
y = seq(-4, 8.5, len = 30),
z = seq(-4, 8.5, len = 30))
Density estimates for two bandwidths:
d <- kde3d(x[, 1], x[, 2], x[, 3], 0.2, 30, c(-4, 8.5))$d
d2 <- kde3d(x[, 1], x[, 2], x[, 3], 0.5, 30, c(-4, 8.5))$d
3D plot of data and contour for bw = 0.2:
clear3d()
xv <- seq(-4, 8.5, len = 30)
points3d(x = x[, 1], y = x[, 2], z = x[, 3], size = 2, color = "black")
contour3d(d, 20 / 3000, xv, xv, xv, color = "red", alpha = 0.5, add = TRUE)
3D plot of data and a contour for bw = 0.5:
clear3d()
points3d(x = x[, 1], y = x[, 2], z = x[, 3], size = 2, color = "black")
contour3d(d2, 20 / 3000, xv, xv, xv, color = "red", alpha = 0.5, add = TRUE)
3D plot of several contours for bw = 0.5:
clear3d()
contour3d(d2, c(10, 25, 40) / 3000, xv, xv, xv,
color = c("red", "green", "blue"), alpha = c(0.2, 0.5, 0.7),
add = TRUE)
Density contours for the quakes
data:
library(misc3d)
library(rgl)
clear3d()
bg3d(col = "white")
points3d(quakes$long / 22, quakes$lat / 28, -quakes$depth / 640, size = 2,
col = rep("black", nrow(quakes)))
box3d(col = "gray")
d <- kde3d(quakes$long, quakes$lat, -quakes$depth, n = 40)
contour3d(d$d, level = exp(-12), x = d$x / 22, y = d$y / 28, z = d$z / 640,
color = "green", color2 = "gray", scale = FALSE, add = TRUE)
Re-scaling depth to (approximately) match surface location scale:
clear3d()
points3d(quakes$long, quakes$lat, -quakes$depth / 110, size = 2,
col = rep("black", nrow(quakes)))
box3d(col = "gray")
d <- kde3d(quakes$long, quakes$lat, -quakes$depth, n = 40)
contour3d(d$d, level = exp(-12), x = d$x, y = d$y, z = d$z / 110,
color = "green", color2 = "gray", scale = FALSE, add = TRUE)
Variant with alpha blending and two contour surfaces:
clear3d()
points3d(quakes$long, quakes$lat, -quakes$depth / 110, size = 2)
box3d(col = "gray")
d <- kde3d(quakes$long, quakes$lat, -quakes$depth, n = 40)
contour3d(d$d, level = exp(c(-10, -12)), x = d$x, y = d$y, z = d$z / 110,
color = c("red", "green"), alpha = 0.1, scale = FALSE, add = TRUE)
Daily measurements of ozone concentration, wind speed, temperature and solar radiation in New York City from May to September of 1973.
This analysis is from Cleveland’s Visualizing Data book.
A goal is to see to what degree ozone variation can be explained by radiation, temperature, and wind variation.
library(lattice)
splom(environmental, pscale = 0)
histogram(environmental$ozone)
histogram(environmental$ozone ^ (1 / 3))
## (Cleveland uses cube root)
env <- mutate(environmental, ozone = ozone ^ (1 / 3))
splom(env)
Ozone seems to increase with temperature and decrease with wind.
Some conditional plots:
xyplot(ozone ~ radiation | equal.count(temperature, 4) *
equal.count(wind, 4),
data = env, type = c("p", "g", "smooth"), asp = 1.5, col.line = "red")
xyplot(ozone ~ temperature |
equal.count(radiation, 3, overlap = 0) *
equal.count(wind, 2, overlap = 0.2),
data = env, type = c("p", "g", "smooth"))
xyplot(ozone ~ temperature | equal.count(radiation, 4, overlap = 0),
group = ifelse(wind >= 10, "high", "low"),
data = env, auto.key = TRUE, type = c("p", "g", "smooth"))
3D scatter plot with wind split in two levels, red
for low wind, blue
for high wind:
ne <- names(env)
wcut <- with(env, cut_number(wind, 2))
idx <- as.integer(wcut)
cols <- c("red", "blue")[idx]
library(rgl)
clear3d()
points3d(scale(env[-4]), radius = 0.05, col = cols)
axes3d()
title3d(xlab = ne[1], ylab = ne[2], zlab = ne[3])
Fitting a loess
surface:
fit.env <- loess(ozone ~ wind * temperature * radiation,
span = 1.0, degree = 2, data = env)
scaleBy <- function(x, y) (x - mean(y)) / sd(y)
Visualising the surface:
tmesh <- with(env, do.breaks(range(temperature), 50))
rmesh <- with(env, do.breaks(range(radiation), 50))
wmesh <- with(env, do.breaks(range(wind), 50))
grid <- expand.grid(temperature = tmesh, radiation = rmesh, wind = wmesh)
grid$fit <- as.numeric(predict(fit.env, newdata = grid))
gridw <- expand.grid(temperature = tmesh, radiation = rmesh, wind = 8)
fit.pred1 <- predict(fit.env, newdata = mutate(gridw, wind = 8))
fit.pred2 <- predict(fit.env, newdata = mutate(gridw, wind = 11))
gridw$fit1 <- as.numeric(fit.pred1)
gridw$fit2 <- as.numeric(fit.pred2)
library(rgl)
clear3d()
with(env,
surface3d(scaleBy(unique(gridw$temperature), temperature),
scaleBy(unique(gridw$radiation), radiation),
scaleBy(gridw$fit1, ozone), col = "red"))
with(env,
surface3d(scaleBy(unique(gridw$temperature), temperature),
scaleBy(unique(gridw$radiation), radiation),
scaleBy(gridw$fit2, ozone), col = "blue"))
axes3d()
box3d()
title3d(xlab = "temp", ylab = "rad", zlab = "ozone")
Conditional plots of the fit surface:
g <- with(env,
expand.grid(radiation = do.breaks(range(radiation), 50),
temperature = do.breaks(range(temperature), 3),
wind = do.breaks(range(wind), 3)))
g$fit <- as.numeric(predict(fit.env, newdata = g))
xyplot(fit ~ radiation | temperature * wind, data = g,
type = c("g", "l"), asp = 1.5)
Cleveland points out that there is no data for high temperature/high wind or low temperature/low wind, to estimates in those regions are unreliable:
xyplot(wind ~ temperature, data = env)
Setting the corresponding fit values to NA
emphasizes the more reliable results:
## Cleveland Fig 5.9
gg <- g
gg$fit[gg$temperature > 80 & gg$wind > 20] <- NA
gg$fit[gg$temperature < 80 & gg$wind < 5] <- NA
xyplot(fit ~ radiation | temperature * wind, data = gg,
type = c("g", "l"), asp = 1.5)
Residuals plots:
env$fit <- predict(fit.env)
env$res <- env$ozone - env$fit
xyplot(res ~ fit, data = env)
Spread-location plots:
xyplot(sqrt(abs(res)) ~ fit, data = env)
Residual and fit spread (rfs) plots
rfs(fit.env)
Medical imaging technologies, such as magnetic resonance imaging (MRI) produce a three-dimensional array of intensity values, one for each volume element, or voxel.
A common visualization allows the viewer to interactively examine image plots of the intensities for different slices through the volume.
readVol <- function(fname, dim = c(181, 217, 181)) {
f <- gzfile(fname, open = "rb")
on.exit(close(f))
b <- readBin(f, "integer", prod(dim), size = 1, signed = FALSE)
array(b, dim)
}
imgfile <-
"/group/statsoft/data/Brainweb/images/T1/t1_icbm_normal_1mm_pn3_rf20.rawb.gz"
v <- readVol(imgfile)
library(misc3d)
slices3d(v)
The interactive heatmap can also be used for functions of three variables, such as a density estimate:
slices3d(d2)
The easiest features to see in a acatterplot:
Changes in point density are also perceivable, but much less easily.
Scatter plots show a two-dimensional projection of the data.
Zero and one dimensional objects can be occluded in a projection by higher-dimensional objects.
Conditioning can reduce dimensionality:
Conditioning on more than two variables is rarely practical.
Many different projections are possible. Some useful tools for exploring:
3D visualizations;
scatter plot matrices;
principal components analysis;
grand and guided tours;
scagnostics.
3D visualizations take advantage of our visual system’s ability to create a 3D mental image from a 2D rendering.
Several techniques are used to help our visual system:
motion;
depth cueing;
perspective projection.
For viewing data it is often helpful to turn off the perspective adjustment.
Most systems for 3D viewing provide a way to do this.
An illustration is provided by the randu
data set of 400 triples of successive random numbers from the RANDU pseudo-random number generator. The flaw is much easier to see then the perspective adjustment is removed.
library(rgl)
clear3d()
points3d(randu, size = 5)
par3d(FOV = 1) ## removes perspective distortion
Two data sets where the objective is to identify class membership from measurements
Australian Crabs
Olive Oils
The data frame crabs
in the MASS
package contains measurement on crabs of a species that has been split into two based on color, orange or blue.
Preserved specimens lose their color (and possibly ability to identify sex).
Data were collected to help classify preserved specimens. It would be useful to have a fairly simple classification rule.
An initial view:
data(crabs, package = "MASS")
splom(~ crabs[4 : 8], group = paste(sex, sp),
data = crabs, auto.key = TRUE, pscale = 0)
The variables are highly correlated, reflecting overall size and age.
The RW * CL
or RW * CW
plots separate males and females well, at least for larger crabs.
Some possible next steps:
Consider a log
transformation if the marginal distributions are skewed.
Reduce the correlation by a linear or nonlinear transformation.
A quick look at histograms:
library(tidyr)
library(ggplot2)
ggallhists <- function(data, nb = nclass.Sturges(data[[1]])) {
ggplot(gather(data, var, val), aes(val)) +
geom_histogram(bins = nb) + xlab("") + ylab("") +
facet_wrap(~ var, scales = "free")
}
ggallhists(crabs[4 : 8])
log
transformation is needed.Principal component analysis identifies a rotation of the data that produces uncorrelated scores.
pc <- princomp(crabs[4 : 8])
splom(~ pc$scores, group = paste(sex, sp),
data = crabs, auto.key = TRUE, pscale = 0)
There is some separation of the classes in the plot of the second and third components.
A nonlinear transformation that scales relative to one of the variables, say CL
, may do better.
cr <- mutate(crabs,
FLCL = FL / CL, RWCL = RW / CL, CWCL = CW / CL, BDCL = BD / CL)
splom(~ cr[9 : 12], group = paste(sex, sp),
data = cr, auto.key = TRUE, pscale = 0)
The CWCL * BDCL
plot shows good species separation by a line.
Using CWCL - BDCL
will pick out the axis of separation.
splom(~ data.frame(FLCL, RWCL, CWCL - BDCL), group = paste(sex, sp),
data = cr, auto.key = TRUE, pscale = 0)
The FLCL * (CWCL - BDCL)
plot shows perfect separation.
After another rotation to FLCL - (CWCL - BDCL) = FLCL + BDCL - CWCL
a simple criterion emerges.
stripplot(paste(sex, sp) ~ FLCL + BDCL - CWCL, data = cr,
jitter = TRUE, group = paste(sex, sp),
panel = function(...) {
panel.stripplot(...)
panel.abline(v = -0.23, lty = 2)
})
RWCL
for larger crabs; less so for smaller onesxyplot(RWCL ~ CL | sp, data = cr, group = paste(sex), auto.key = TRUE)
CL
of 30 into small and large gives a simple rule that works fairly well across species and sizes.xyplot(RWCL ~ CL | sp, data = cr, group = paste(sex), auto.key = TRUE,
panel = function(...) {
panel.xyplot(...)
## panel.segments(c(0, 30), c(0.42, 0.39), c(30, 60), c(0.42, 0.39),
## lty = 2)
ds <- 0.42
dl <- 0.39
panel.lines(c(0, 30, 30, 60), c(ds, ds, dl, dl),
lty = 2, col = "black")
})
pc <- princomp(cr[9 : 12])
splom(~ pc$scores, group = paste(sex, sp),
data = cr, auto.key = TRUE, pscale = 0)
When designing a classifier it is usually a good idea to split the data into a training set and a _test set.
After building the classifier with the training data its performance can be assessed using the test data.
The olives
data frame in the extracat
package contains measurements on the proportion of 8 fatty acids (% times 100) in olive oil samples.
A goal is to see if region and perhaps area can be determined from these measurements.
This might help in discovering cheap varieties masquerading as expensive ones.
A useful approach is to first try to identify the regions and then the areas within regions.
A first look at the data:
library(lattice)
olives <- read.csv("https://stat.uiowa.edu/~luke/data/olives.csv")
splom(~ olives[3 : 10], cex = 0.1, group = Region,
data = olives, auto.key = TRUE, pscale = 0)
South
region have a much higher eicosenoic
content than oils from the other regions.stripplot(Region ~ eicosenoic, data = olives, type = c("p", "g"),
jitter = TRUE, group = Region)
North
from Sardinia
we can remove the South
samples.ons <- filter(olives, Region != "South")
ons <- droplevels(ons)
splom(~ ons[3 : 10], cex = 0.1, group = Region, data = ons,
auto.key = TRUE, pscale = 0)
arachidic * linoleic
plot allows these two regions to be separated:xyplot(arachidic ~ linoleic, group = Region, data = ons,
type = c("p", "g"), auto.key = TRUE)
South
oils the ones from Sardinia
can be identified as the ones with linoleic > 1000
and arachidic > 40
.Looking at the oils for the two areas within Sardinia:
sar <- filter(olives, Region == "Sardinia")
sar <- droplevels(sar)
splom(~ sar[3 : 10], cex = 0.1, group = Area,
data = sar, auto.key = TRUE, pscale = 0)
Sardinia
the oleic * linoleic
plot suggests looking at oleic - linoleic
stripplot(Area ~ oleic - linoleic, data = sar, jitter = TRUE,
type = c("p", "g"))
The inland oils are identified by having oleic - linoleic > 6000
.
An alternative approach is to use rggobi
to explore the data with grand tours and guided tours.
The grand tour is a framework for animating smoothly through a sequence of randomly chosen projections.
A guided tour encourages the tour to move towards projections that are more interesting according to one of several criteria.
The rggobi
approach is documented in
Cook, Dianne and Swayne, Deborah F. (2007), Interactive and Dynamic Graphics for Data Analysis with R and GGobi_, Springer Verlag.
To explore this approach with the scaled crabs
data:
library(rggobi)
ggobi(cr)
With a large number of variables it isn’t realistic to view all possible pairwise scatter plots.
This becomes even more of an issue with automated projection systems like the grand tour.
Scagnostics, or scatter plot diagnostics, are numerical summaries of interesting features in a scatter plot.
The projection pursuit indices used in the guided tour are a form of scagnostic.
One possible use is to automatically compute sacgnostics for all possible pairwise scatter plots and view the ones with high scores.
A German Wikipedia article has some graphics that illustrate the proposed diagnostics.
Wilkinson, Leland, Anushka Anand, and Robert L. Grossman. “Graph-theoretic scagnostics.” INFOVIS. Vol. 5. 2005.
Dang, Tuan Nhon, and Leland Wilkinson. “Scagexplorer: Exploring scatterplots by their scagnostics.” Visualization Symposium (PacificVis), 2014 IEEE Pacific. IEEE, 2014.
library(scagnostics)
## Loading required package: rJava
s <- scagnostics(olives[3 : 10])
heatmap(s)
colnames(s)[max.col(s)]
## [1] "stearic * arachidic" "palmitic * eicosenoic" "oleic * eicosenoic"
## [4] "linolenic * arachidic" "oleic * linolenic" "linolenic * arachidic"
## [7] "oleic * linoleic" "palmitic * arachidic" "palmitoleic * oleic"
splom(t(unclass(s)), varname.cex = 0.7, pscale = 0)
This function creates a shiny
animation using two sliders:
## two-parameter shiny
## performance is not good
shinyAnim2 <- function(fun, v, lo, hi, step = NULL, label = "Frame", fps = 4) {
require(shiny)
msecs <- 1000 / fps
shinyApp(
ui = fluidPage(
sidebarLayout(
sidebarPanel(
sliderInput("v1", label[1], lo[1], hi[1], v[1], step[1],
animate = animationOptions(msecs)),
sliderInput("v2", label[2], lo[2], hi[2], v[2], step[2],
animate = animationOptions(msecs))),
mainPanel(plotOutput("plot"))
)
),
server = function(input, output, session) {
session$onSessionEnded(function() shiny::stopApp())
output$plot <- renderPlot(fun(input$v1, input$v2))
})
}
Some calibration data:
library(tidyverse)
library(shiny)
library(lattice)
library(ggplot2)
n <- 10000
d <- data.frame(x1 = runif(n, -1, 1), x2 = runif(n, -1, 1), x3 = rnorm(n))
d <- mutate(d,
x4 = as.numeric(scale(x1 + cos(x2) + sin(x3) + rnorm(n, 0, 0.1))))
Animation using splom
:
fd <- function(v1, v2)
splom(~ d, data = d, group = abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1)
shinyAnim2(fd, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05))
Using xyplot
:
fd2 <- function(v1, v2)
xyplot(x4 ~ x3, group = abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1,
data = d, pch = 19)
shinyAnim2(fd2, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05))
Using ggplot
:
fd3 <- function(v1, v2)
ggplot(d, aes(x = x3, y = x4)) + geom_point(color = "grey") +
geom_point(data = filter(d, abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1),
color = "red")
shinyAnim2(fd3, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05))
Rendering of ggplot
and lattice
graphics is slow and not ideal for animations; using base graphics helps a little:
fd4 <- function(v1, v2) {
plot(x4 ~ x3, data = d, pch = 19, col = "blue", asp = 1)
with(filter(d, abs(x1 - v1) < 0.1 & abs(x2 - v2) < 0.1),
points(x3, x4, col = "red", pch = 19))
}
shinyAnim2(fd4, c(0, 0), c(-1, -1), c(1, 1), c(0.05, 0.05))