---
title: "Snowfall in Iowa City"
output:
html_document:
toc: yes
code_download: true
code_folding: "show"
---
Adapted from a [blog
post](https://sctyner.github.io/redoing-graphs.html) by [Sam
Tyner](https://sctyner.github.io)
```{r global_options, include = FALSE}
knitr::opts_chunk$set(collapse = TRUE, fig.align = "center")
```
```{r, message = FALSE}
library(dplyr)
library(tidyr)
library(rnoaa)
library(ggplot2)
library(ggrepel)
library(patchwork)
theme_set(theme_minimal() + theme(text = element_text(size = 16)))
```
## Getting the Data from NOAA
Climate data is available via a [web service
API](https://www.ncdc.noaa.gov/cdo-web/webservices/v2) from
[NOAA](https://www.noaa.gov/).
The
[`rnoaa`](https://cran.r-project.org/package=rnoaa)
package provides an interface to this API. A
[tutorial](https://docs.ropensci.org/rnoaa/articles/rnoaa.html) is
available from [rOpenSci](https://ropensci.org).
This code would be used to find the most suitable Iowa City station:
```{r, eval = FALSE}
options("") ## may not be needed
stat <- ghcnd_stations()
ic_stat <- filter(stat, grepl("IOWA CITY", name) & element == "SNOW")
select(ic_stat, name, id, ends_with("year"))
```
The best station seems to be
```{r}
ic_id <- "USC00134101"
```
The current data for this station can then be obtained from NOAA:
```{r, eval = FALSE}
ic_data_raw <- ghcnd(ic_id, refresh = TRUE)
## goes into a cache file as csv
## /home/luke/.cache/rnoaa/ghcnd/USC00134101.dly
write.csv(ic_data_raw, row.names = FALSE, file = "ic_noaa.csv")
## gzip and move to data directory
```
I have done these steps and stored the data locally; I will update
the data as the course progresses.
Download the local data copy:
```{r}
if (! file.exists("ic_noaa.csv.gz"))
download.file("http://www.stat.uiowa.edu/~luke/data/ic_noaa.csv.gz",
"ic_noaa.csv.gz")
## make sure data is up to date
ic_data_raw <- as_tibble(read.csv("ic_noaa.csv.gz", head = TRUE))
```
## Processing the Data
The raw data:
```{r}
ic_data_raw
```
The documentation for the data is available at
> https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt
The available _elements_, or measurements, are:
```{r}
unique(ic_data_raw$element)
```
The primary ones are:
```
PRCP = Precipitation (tenths of mm)
SNOW = Snowfall (mm)
SNWD = Snow depth (mm)
TMAX = Maximum temperature (tenths of degrees C)
TMIN = Minimum temperature (tenths of degrees C)
```
The values for day $k$ are stored in the variable `VALUE`$k$. A selection:
```{r}
ic_feb2019 <- filter(ic_data_raw, year == 2019, month == 2)
select(ic_feb2019, element, VALUE1, VALUE28, VALUE31)
```
We need to:
- Reshape the data.
- Deal with the bogus days.
Start by selecting the columns we need:
```{r}
ic_data <- select(ic_data_raw, year, month, element, starts_with("VALUE"))
ic_data
```
Reshape from (very) wide to (too) long:
```{r}
ic_data <- pivot_longer(ic_data, VALUE1 : VALUE31,
names_to = "day", values_to = "value")
ic_data
```
Extract the day as a number:
```{r}
ic_data <- mutate(ic_data, day = readr::parse_number(day))
ic_data
```
Reshape from too long to tidy with one row per day, keeping only the
primary variables:
```{r}
corevars <- c("TMAX", "TMIN", "PRCP", "SNOW", "SNWD")
ic_data <- filter(ic_data, element %in% corevars)
ic_data <- pivot_wider(ic_data, names_from = "element", values_from = "value")
ic_data
```
Add a `date` variable for plotting and to help get rid of bogus days:
```{r}
ic_data <- mutate(ic_data, date = lubridate::make_date(year, month, day))
filter(ic_data, year == 2019, month == 2, day >= 27)
```
Get rid of the bogus days:
```{r}
ic_data <- filter(ic_data, ! is.na(date))
```
Make units more standard (American):
```{r}
mm2in <- function(x) x / 25.4
C2F <- function(x) 32 + 1.8 * x
ic_data <- transmute(ic_data, year, month, day, date,
PRCP = mm2in(PRCP / 10),
SNOW = mm2in(SNOW),
SNWD = mm2in(SNWD),
TMIN = C2F(TMIN / 10),
TMAX = C2F(TMAX / 10))
```
## The Winters of 2018/2019 and 2020/2021
### Daily Snowfall
Extract the data for days between November 1 and March 31:
```{r}
w18 <- filter(ic_data, date >= "2018-11-01" & date <= "2019-03-31")
w20 <- filter(ic_data, date >= "2020-11-01" & date <= "2021-03-31")
```
A first set of plots:
```{r, fig.width = 9}
p1 <- ggplot(w18, aes(x = date)) + geom_col(aes(y = SNOW))
p2 <- ggplot(w18, aes(x = date)) + geom_line(aes(y = SNWD))
p1 + p2
```
For daily snowfall it is probably reasonable to replace missing values
by zeros.
```{r}
w18 <- mutate(w18, SNOW = ifelse(is.na(SNOW), 0, SNOW))
w20 <- mutate(w20, SNOW = ifelse(is.na(SNOW), 0, SNOW))
```
For snow depth it may make sense to _fill forward_, replacing misusing
values by the previous non-missing value. Using the `fill` function
from `tidyr` is one way to do this.
```{r}
w18 <- fill(w18, SNWD)
w20 <- fill(w20, SNWD)
```
The plot of snow depth looks a bit better with this change; the axis
range for the plot of daily snowfall is also affected. Also, the
warnings are eliminated.
```{r, fig.width = 9}
p1 <- ggplot(w18, aes(x = date)) + geom_col(aes(y = SNOW))
p2 <- ggplot(w18, aes(x = date)) + geom_line(aes(y = SNWD))
(p1 + labs(title = "2018/19")) + p2
(p1 %+% w20 + labs(title = "2020/21")) + (p2 %+% w20)
```
Often snowfall is visualized in terms of the cumulative snowfall for
the season up to each date:
```{r}
p <- ggplot(w18, aes(x = date)) + geom_line(aes(y = cumsum(SNOW)))
p + labs(title = "2018/19")
p %+% w20 + labs(title = "2020/21")
```
### Comparing to Long Term Averages
Compute the average snowfall for each day of the year and merge the
results for the days in `w18` into `w18` with a `left_join`:
```{r, message = FALSE}
ic_avg <- group_by(ic_data, month, day) %>%
summarize(av_snow = mean(SNOW, na.rm = TRUE),
av_snwd = mean(SNWD, na.rm = TRUE)) %>%
ungroup()
w18 <- left_join(w18, ic_avg, c("month", "day"))
w20 <- left_join(w20, ic_avg, c("month", "day"))
```
Daily snowfall and snow cover for the 2018/19 winter along with
averages over all years in the record:
```{r, fig.width = 9}
p1 <- ggplot(w18, aes(x = date)) +
geom_col(aes(y = SNOW)) +
geom_line(aes(y = av_snow), color = "blue")
p2 <- ggplot(w18, aes(x = date)) +
geom_line(aes(y = SNWD)) +
geom_line(aes(y = av_snwd), color = "blue")
(p1 + labs(title = "2018/19")) + p2
(p1 %+% w20 + labs(title = "2020/21")) + (p2 %+% w20)
```
Cumulative snowfall for the 2018/19 winter along with average
cumulative snowfall over all years in the record:
```{r}
p <- ggplot(w18, aes(x = date)) +
geom_line(aes(y = cumsum(SNOW))) +
geom_line(aes(y = cumsum(av_snow)), color = "blue")
p + labs(title = "2018/19")
p %+% w20 + labs(title = "2020/21")
```
### A Combined Graph
```{r}
w18day <- transmute(w18, date, snow = SNOW,
base = "2018/19",
type = "Daily Snowfall")
nrmday <- transmute(w18, date, snow = av_snow,
base = "Normal",
type = "Daily Snowfall")
w18cum <- transmute(w18day, date, snow = cumsum(snow),
base = "2018/19",
type = "Cumulative Snowfall")
nrmcum <- transmute(nrmday, date, snow = cumsum(snow),
base = "Normal",
type = "Cumulative Snowfall")
```
```{r, fig.height = 8}
ggplot(w18, aes(x = date, y = snow, color = base)) +
geom_line(data = w18cum) +
geom_line(data = nrmcum) +
geom_segment(aes(x = date, xend = date, y = 0, yend = snow),
size = 1, data = w18day) +
geom_line(data = nrmday) +
facet_wrap(~ type, ncol = 1, scales = "free_y") +
xlab(element_blank()) + ylab("Snowfall (inches)") +
ggtitle("2018/19 Snowfall in Iowa City") +
scale_color_manual(values = c("2018/19" = "navy",
"Normal" = "forestgreen")) +
guides(color = guide_legend(override.aes = list(size = 1.5))) +
theme_bw() +
theme(legend.position = "bottom", legend.title = element_blank(),
plot.title = element_text(face = "bold", hjust = .5),
aspect.ratio = 0.4)
```
### Total Snow Fall by Year and Month
It is helpful to define a variable that shifts the year to include a
full winter, not the end of one and the start of the next.
What month should be start with? Here are the counts for months with
recorded snowfall:
```{r}
count(filter(ic_data, SNOW > 0), month)
```
The July snowfall seems odd:
```{r}
filter(ic_data, SNOW > 0, month == 7)
```
This is probably a recording error.
For October, the years with recorded recorded snowfall are shown by
```{r}
count(filter(ic_data, SNOW > 0, month == 10), year)
```
It seems reasonable to start the _winter year_ in October:
```{r}
ic_data <- mutate(ic_data, wyear = ifelse(month >= 10, year, year - 1))
```
The yearly total snowfall values:
```{r}
tot <- group_by(ic_data, wyear) %>%
summarize(tsnow = sum(SNOW, na.rm = TRUE))
tot18 <- filter(tot, wyear == 2018)
ggplot(tot, aes(x = wyear, y = tsnow)) +
geom_line() +
geom_point(data = tot18, color = "red") +
geom_text_repel(data = tot18, label = "2018/19", color = "red")
```
So 2018/19 is not a record, but still pretty high:
```{r}
tot100 <- mutate(filter(tot, wyear >= 1918), rank = min_rank(desc(tsnow)))
filter(tot100, wyear == 2018)
```
What are the months with the highest average total snowfall?
```{r}
mtot <- group_by(ic_data, year, wyear, month) %>%
summarize(msnow = sum(SNOW, na.rm = TRUE)) %>%
ungroup()
group_by(mtot, month) %>% summarize(av_msnow = mean(msnow))
```
For 2018/19:
```{r}
filter(mtot, wyear == 2018)
```
For 2020/21:
```{r}
filter(mtot, wyear == 2020)
```
## A Data Processing Functions
If you want to look at data for another city you would need to go
through the same steps starting with a new raw data frame.
Defining a function that encapsulates the steps makes this easy:
```{r}
processData <- function(data_raw) {
## select just the columns we need:
data <- select(data_raw, year, month, element, starts_with("VALUE"))
## reshape from (very) wide to (too) long
data <- pivot_longer(data, VALUE1 : VALUE31,
names_to = "day", values_to = "value")
## extract the day as a number:
data <- mutate(data, day = readr::parse_number(day))
## reshape from too long to tidy with one row per day, keeping
## only the primary variables:
corevars <- c("TMAX", "TMIN", "PRCP", "SNOW", "SNWD")
data <- filter(data, element %in% corevars)
data <- pivot_wider(data, names_from = "element", values_from = "value")
## add a 'date' variable and get rid of bogus days
data <- mutate(data, date = lubridate::make_date(year, month, day))
data <- filter(data, ! is.na(date))
## make units more standard (American)
mm2in <- function(x) x / 25.4
C2F <- function(x) 32 + 1.8 * x
data <- transmute(data, year, month, day, date,
PRCP = mm2in(PRCP / 10),
SNOW = mm2in(SNOW),
SNWD = mm2in(SNWD),
TMIN = C2F(TMIN / 10),
TMAX = C2F(TMAX / 10))
## add winter year variable 'wyear' starting in October
data <- mutate(data, wyear = ifelse(month >= 10, year, year - 1))
data
}
```
A quick sanity check:
```{r}
stopifnot(identical(ic_data, processData(ic_data_raw)))
```
Often it is a good idea to include a sanity check like this in a code
chunk marked as `include = FALSE`.
## Other Explorations
```{r}
pivot_longer(w18, TMIN | TMAX, names_to = "which", values_to = "value") %>%
ggplot(aes(x = date)) +
geom_line(aes(y = value, color = which))
```
```{r}
wd18 <- filter(ic_data, date >= "2018-10-01")
avs <- group_by(ic_data, month, day) %>%
summarize(avsnow = mean(SNOW, na.rm = TRUE),
avsnwd = mean(SNWD, na.rm = TRUE),
avtmin = mean(TMIN, na.rm = TRUE),
avtmax = mean(TMAX, na.rm = TRUE)) %>%
ungroup()
wd18 <- left_join(wd18, avs)
ggplot(filter(w18, ! is.na(SNWD))) +
geom_line(aes(x = date, y = SNWD)) +
geom_line(aes(x = date, y = av_snwd), col = "blue")
```
```{r}
plot(SNOW ~ date, data = wd18, type = "h")
lines(wd18$date, wd18$avsnow, col = "red")
lines(smooth.spline(wd18$date, wd18$avsnow), col = "blue")
plot(SNWD ~ date, data = fill(wd18, SNWD), type = "l")
lines(wd18$date, wd18$avsnwd, col = "red")
plot(cumsum(ifelse(is.na(SNOW), 0, SNOW)) ~ date, data = wd18, type = "l")
lines(wd18$date, cumsum(wd18$avsnow), col = "red")
```
```{r}
thm <- theme_bw() +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank())
ggplot(wd18) +
geom_col(aes(x = date, y = SNOW)) +
geom_line(aes(x = date, y = avsnow), col = "red") +
geom_smooth(aes(x = date, y = avsnow), se = FALSE) +
thm
ggplot(fill(wd18, SNWD)) +
geom_line(aes(x = date, y = SNWD)) +
geom_line(aes(x = date, y = avsnwd), col = "red") +
geom_smooth(aes(x = date, y = avsnwd), se = FALSE) +
thm
ggplot(wd18) +
geom_line(aes(x = date, y = cumsum(ifelse(is.na(SNOW), 0, SNOW)))) +
geom_line(aes(x = date, y = cumsum(wd18$avsnow)), col = "red") +
thm
```
```{r}
ggplot(filter(ic_data, year == 2019), aes(x = date)) +
geom_line(aes(y = TMAX, color = "TMAX")) +
geom_line(aes(y = TMIN, color = "TMIN")) +
thm
```
```{r}
d13 <- filter(ic_data, wyear >= 2013, month <= 4 | month >= 10)
d13 <- mutate(d13, wday = as.numeric(date - lubridate::make_date(wyear, 10, 1)),
SNOW = ifelse(is.na(SNOW), 0, SNOW))
ggplot(d13) +
geom_col(aes(x = wday, y = SNOW), width = 3, position = "identity") +
facet_wrap(~ wyear) + thm
ggplot(d13) + geom_line(aes(x = wday, y = SNOW)) + facet_wrap(~ wyear) + thm
```
```{r}
d13 <- group_by(d13, wyear) %>%
mutate(snow_total = cumsum(SNOW)) %>%
ungroup()
ggplot(d13) +
geom_line(aes(x = wday, y = snow_total, color = factor(wyear))) + thm
ggplot(d13) +
geom_line(aes(x = wday, y = snow_total)) +
facet_wrap(~ wyear) + thm
```
```{r}
d13 <- fill(d13, SNWD)
ggplot(d13) + geom_line(aes(x = wday, y = SNWD)) + facet_wrap(~ wyear) + thm
```
```{r}
w <- filter(ic_data, wyear >= 2013)
w <- mutate(w, wday = date - lubridate::make_date(wyear, 10, 1))
w <- filter(w, wday <= 180)
w <- mutate(w, wday = lubridate::make_date(2018, 10, 1) + wday)
ggplot(w) +
geom_line(aes(x = wday, y = TMAX), color = "blue") +
geom_line(aes(x = wday, y = TMIN), color = "red") +
facet_wrap(~ wyear)
```
- Better x axis label for winter days
- set theme globally
- re-create original plot in some variants
- better facet labels