Background

library(readr)
if (! file.exists("vehicles.csv.zip"))
    download.file("http://www.stat.uiowa.edu/~luke/data/vehicles.csv.zip",
                  "vehicles.csv.zip")
newmpg <- read_csv("vehicles.csv.zip", guess_max = 10000)
## Warning: 10197 parsing failures.
##   row     col           expected actual               file
## 23158 mfrCode 1/0/T/F/TRUE/FALSE    ADX 'vehicles.csv.zip'
## 23159 mfrCode 1/0/T/F/TRUE/FALSE    ADX 'vehicles.csv.zip'
## 23160 mfrCode 1/0/T/F/TRUE/FALSE    ADX 'vehicles.csv.zip'
## 23161 mfrCode 1/0/T/F/TRUE/FALSE    ADX 'vehicles.csv.zip'
## 23162 mfrCode 1/0/T/F/TRUE/FALSE    BEX 'vehicles.csv.zip'
## ..... ....... .................. ...... ..................
## See problems(...) for more details.
library(nycflights13)
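
The parsing failures arise because readr guessed mfrCode to be a logical column (its early rows are apparently empty). A sketch of one way to avoid the warning, using readr's cols() specification to declare that column's type explicitly:

newmpg <- read_csv("vehicles.csv.zip", guess_max = 10000,
                   col_types = cols(mfrCode = col_character()))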

Selecting Variables

newmpg1 <- select(newmpg,
                  make, model, year,
                  cty = city08,
                  trans = trany,
                  cyl = cylinders,
                  fuel = fuelType1,
                  displ)
head(newmpg1, 2)
## # A tibble: 2 x 8
##   make      model            year   cty trans        cyl fuel         displ
##   <chr>     <chr>           <dbl> <dbl> <chr>      <dbl> <chr>        <dbl>
## 1 Alfa Rom… Spider Veloce …  1985    19 Manual 5-…     4 Regular Gas…   2  
## 2 Ferrari   Testarossa       1985     9 Manual 5-…    12 Regular Gas…   4.9

Some variations:

nm <- select(newmpg1, year : trans)
head(nm, 2)
## # A tibble: 2 x 3
##    year   cty trans       
##   <dbl> <dbl> <chr>       
## 1  1985    19 Manual 5-spd
## 2  1985     9 Manual 5-spd
nm <- select(newmpg1, -year , -trans)
head(nm, 2)
## # A tibble: 2 x 6
##   make       model                cty   cyl fuel             displ
##   <chr>      <chr>              <dbl> <dbl> <chr>            <dbl>
## 1 Alfa Romeo Spider Veloce 2000    19     4 Regular Gasoline   2  
## 2 Ferrari    Testarossa             9    12 Regular Gasoline   4.9
nm <- select(newmpg1, -(year : trans))
head(nm, 2)
## # A tibble: 2 x 5
##   make       model                cyl fuel             displ
##   <chr>      <chr>              <dbl> <chr>            <dbl>
## 1 Alfa Romeo Spider Veloce 2000     4 Regular Gasoline   2  
## 2 Ferrari    Testarossa            12 Regular Gasoline   4.9

Numerical column specifications can also be used:

nm <- select(newmpg1, 1, 3)
head(nm, 2)
## # A tibble: 2 x 2
##   make        year
##   <chr>      <dbl>
## 1 Alfa Romeo  1985
## 2 Ferrari     1985
nm <- select(newmpg1, 1 : 3)
head(nm, 2)
## # A tibble: 2 x 3
##   make       model               year
##   <chr>      <chr>              <dbl>
## 1 Alfa Romeo Spider Veloce 2000  1985
## 2 Ferrari    Testarossa          1985
nm <- select(newmpg1, -(1 : 3))
head(nm, 2)
## # A tibble: 2 x 5
##     cty trans          cyl fuel             displ
##   <dbl> <chr>        <dbl> <chr>            <dbl>
## 1    19 Manual 5-spd     4 Regular Gasoline   2  
## 2     9 Manual 5-spd    12 Regular Gasoline   4.9

Some helper functions can also be used in the column specifications.

The most useful ones are starts_with, ends_with, and contains.

nm <- select(newmpg, starts_with("fuel"))
head(nm, 2)
## # A tibble: 2 x 5
##   fuelCost08 fuelCostA08 fuelType fuelType1        fuelType2
##        <dbl>       <dbl> <chr>    <chr>            <chr>    
## 1       1600           0 Regular  Regular Gasoline <NA>     
## 2       3050           0 Regular  Regular Gasoline <NA>
nm <- select(newmpg, contains("city"))
head(nm, 2)
## # A tibble: 2 x 12
##   city08 city08U cityA08 cityA08U cityCD cityE cityUF rangeCity rangeCityA
##    <dbl>   <dbl>   <dbl>    <dbl>  <dbl> <dbl>  <dbl>     <dbl>      <dbl>
## 1     19       0       0        0      0     0      0         0          0
## 2      9       0       0        0      0     0      0         0          0
## # … with 3 more variables: UCity <dbl>, UCityA <dbl>, phevCity <dbl>
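
The ends_with helper works the same way; for example, a sketch (using the flights table from nycflights13) selecting the delay columns by their suffix:

## selects dep_delay and arr_delay
nm <- select(flights, ends_with("delay"))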

These can also be preceded by a minus modifier to omit columns:

nm <- select(newmpg1, -contains("m"))
head(nm, 2)
## # A tibble: 2 x 6
##    year   cty trans          cyl fuel             displ
##   <dbl> <dbl> <chr>        <dbl> <chr>            <dbl>
## 1  1985    19 Manual 5-spd     4 Regular Gasoline   2  
## 2  1985     9 Manual 5-spd    12 Regular Gasoline   4.9

Sometimes it is useful to move a few columns to the front; the everything helper can be used for that:

head(flights, 2)
## # A tibble: 2 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
fl <- select(flights, air_time, everything())
head(fl, 2)
## # A tibble: 2 x 19
##   air_time  year month   day dep_time sched_dep_time dep_delay arr_time
##      <dbl> <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1      227  2013     1     1      517            515         2      830
## 2      227  2013     1     1      533            529         4      850
## # … with 11 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

The variant select_if allows columns to be selected based on a predicate applied to the full column:

nm <- select_if(newmpg1, anyNA)
head(nm, 2)
## # A tibble: 2 x 3
##   trans          cyl displ
##   <chr>        <dbl> <dbl>
## 1 Manual 5-spd     4   2  
## 2 Manual 5-spd    12   4.9
fl <- select_if(flights, anyNA)
head(fl, 2)
## # A tibble: 2 x 6
##   dep_time dep_delay arr_time arr_delay tailnum air_time
##      <int>     <dbl>    <int>     <dbl> <chr>      <dbl>
## 1      517         2      830        11 N14228       227
## 2      533         4      850        20 N24211       227
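
Other predicates can be used in the same way; for example, a sketch keeping only the character columns:

## keeps carrier, tailnum, origin, and dest
nm <- select_if(flights, is.character)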

Columns can also be selected using list operations:

vars <- c("make", "model", "year", "city08", "trany", "cylinders",
          "fuelType1", "displ")
nm <- newmpg[vars]
head(nm, 2)
## # A tibble: 2 x 8
##   make     model         year city08 trany     cylinders fuelType1    displ
##   <chr>    <chr>        <dbl>  <dbl> <chr>         <dbl> <chr>        <dbl>
## 1 Alfa Ro… Spider Velo…  1985     19 Manual 5…         4 Regular Gas…   2  
## 2 Ferrari  Testarossa    1985      9 Manual 5…        12 Regular Gas…   4.9

Filtering Rows

filter selects the subset of rows that satisfy one or more conditions.

All cars with city MPG at or above 130 and model year 2018:

filter(newmpg1, cty >= 130, year == 2018)
## # A tibble: 2 x 8
##   make    model              year   cty trans          cyl fuel       displ
##   <chr>   <chr>             <dbl> <dbl> <chr>        <dbl> <chr>      <dbl>
## 1 Hyundai Ioniq Electric     2018   150 Automatic (…    NA Electrici…    NA
## 2 Tesla   Model 3 Long Ran…  2018   136 Automatic (…    NA Electrici…    NA

All flights from New York to Des Moines on January 1, 2013:

filter(flights, day == 1, month == 1, dest == "DSM")
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1     2119           1930       109     2358
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Comparisons

  • The basic comparison operators are

    • ==, !=
    • <, <=
    • >, >=
  • Be sure to use ==, not a single = character.

  • Be careful about floating point equality tests:

x <- 1 - 0.9
x == 0.1
## [1] FALSE
near(x, 0.1)
## [1] TRUE
  • Equality comparisons can be used on character vectors and factors.

  • Order comparisons can be used on character vectors, but may produce surprising results, or results that vary with different locale settings.

  • %in% can be used to match against one of several options:

filter(flights, day %in% 1:2, month == 1, dest == "DSM")
## # A tibble: 2 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1     2119           1930       109     2358
## 2  2013     1     2     1921           1909        12     2143
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Logical Operations

  • The basic logical operations on vectors are

    • & – and
    • | – or
    • ! – not
    • xor – exclusive or
  • Truth tables:

x <- c(TRUE, TRUE,  FALSE, FALSE)
y <- c(TRUE, FALSE, TRUE,  FALSE)
! x
## [1] FALSE FALSE  TRUE  TRUE
x & y
## [1]  TRUE FALSE FALSE FALSE
x | y
## [1]  TRUE  TRUE  TRUE FALSE
xor(x, y)
## [1] FALSE  TRUE  TRUE FALSE
  • The previous flights example can also be written as
filter(flights, day == 1 | day == 2, month == 1, dest == "DSM")
## # A tibble: 2 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1     2119           1930       109     2358
## 2  2013     1     2     1921           1909        12     2143
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
  • It can also be written as
filter(flights, (day == 1 | day == 2) & month == 1 & dest == "DSM")
## # A tibble: 2 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1     2119           1930       109     2358
## 2  2013     1     2     1921           1909        12     2143
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Missing Values

  • filter only keeps values corresponding to TRUE predicate values; FALSE and NA values are dropped.

  • Arithmetic and comparison operators will usually produce an NA value if one operand is NA.

1 + NA
## [1] NA
NA < 1
## [1] NA
NA == NA
## [1] NA
  • In a few cases a non-NA result can be determined:
TRUE | NA
## [1] TRUE
NA & FALSE
## [1] FALSE
NA ^ 0
## [1] 1
  • is.na can also be used to select rows with NA values.

Non-electric vehicles with zero or NA for displ:

filter(newmpg1, displ == 0 | is.na(displ), fuel != "Electricity")
## # A tibble: 2 x 8
##   make   model     year   cty trans          cyl fuel             displ
##   <chr>  <chr>    <dbl> <dbl> <chr>        <dbl> <chr>            <dbl>
## 1 Subaru RX Turbo  1985    22 Manual 5-spd    NA Regular Gasoline    NA
## 2 Subaru RX Turbo  1985    21 Manual 5-spd    NA Regular Gasoline    NA

Flights with NA for both dep_time and arr_time:

fl <- filter(flights, is.na(dep_time) & is.na(arr_time))
head(fl, 2)
## # A tibble: 2 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1       NA           1630        NA       NA
## 2  2013     1     1       NA           1935        NA       NA
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

An alternative approach that does not use filter would be

idx <- with(flights, is.na(dep_time) & is.na(arr_time))
fl <- flights[idx,]
head(fl, 2)
## # A tibble: 2 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1       NA           1630        NA       NA
## 2  2013     1     1       NA           1935        NA       NA
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Adding New Variables

mutate can be used to define new variables or modify existing ones.

fl <- select(flights, year, month, day, air_time, ends_with("delay"))
head(fl, 2)
## # A tibble: 2 x 6
##    year month   day air_time dep_delay arr_delay
##   <int> <int> <int>    <dbl>     <dbl>     <dbl>
## 1  2013     1     1      227         2        11
## 2  2013     1     1      227         4        20
fl <- mutate(fl,
             gain = arr_delay - dep_delay,
             air_hours = air_time / 60,
             gain_per_hour = gain / air_hours)
head(fl, 2)
## # A tibble: 2 x 9
##    year month   day air_time dep_delay arr_delay  gain air_hours
##   <int> <int> <int>    <dbl>     <dbl>     <dbl> <dbl>     <dbl>
## 1  2013     1     1      227         2        11     9      3.78
## 2  2013     1     1      227         4        20    16      3.78
## # … with 1 more variable: gain_per_hour <dbl>
fl <- transmute(flights,
                gain = arr_delay - dep_delay,
                air_hours = air_time / 60,
                gain_per_hour = gain / air_hours)
head(fl, 2)
## # A tibble: 2 x 3
##    gain air_hours gain_per_hour
##   <dbl>     <dbl>         <dbl>
## 1     9      3.78          2.38
## 2    16      3.78          4.23
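
mutate can also overwrite an existing variable rather than add a new one; a small sketch (using a throwaway name fl2) that replaces air_time in minutes by air_time in hours:

fl2 <- mutate(flights, air_time = air_time / 60)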

A reasonable guess is that flights with NA values for both dep_time and arr_time were cancelled:

fl <- mutate(flights, cancelled = is.na(dep_time) & is.na(arr_time))

For the geyser data set from the MASS package we used lag to obtain the duration of the previous eruption:

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
gs <- mutate(geyser, prev_duration = lag(duration))
ggplot(gs) + geom_point(aes(x = prev_duration, y = waiting))
## Warning: Removed 1 rows containing missing values (geom_point).
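
Here lag shifts its argument down by one position, filling in an NA at the front; a quick sketch on a small vector:

lag(c(1, 2, 3, 4))  # NA 1 2 3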

The top five (there may be more if there is a tie for fifth place) in cty gas mileage:

nm <- mutate(newmpg1, cty_rank = min_rank(desc(cty)))
filter(nm, cty_rank <= 5)
## # A tibble: 8 x 9
##   make   model         year   cty trans           cyl fuel   displ cty_rank
##   <chr>  <chr>        <dbl> <dbl> <chr>         <dbl> <chr>  <dbl>    <int>
## 1 Scion  iQ EV         2013   138 Automatic (v…    NA Elect…    NA        4
## 2 BMW    i3 BEV        2014   137 Automatic (A…    NA Elect…    NA        5
## 3 BMW    i3 BEV        2015   137 Automatic (A…    NA Elect…    NA        5
## 4 BMW    i3 BEV        2016   137 Automatic (A…    NA Elect…    NA        5
## 5 BMW    i3 BEV (60 …  2017   137 Automatic (A…    NA Elect…    NA        5
## 6 Hyund… Ioniq Elect…  2017   150 Automatic (A…    NA Elect…    NA        1
## 7 Hyund… Ioniq Elect…  2018   150 Automatic (A…    NA Elect…    NA        1
## 8 Hyund… Ioniq Elect…  2019   150 Automatic (A…    NA Elect…    NA        1
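
The reason more than five rows can be returned is that min_rank gives tied values the same (smallest) rank; a small sketch:

min_rank(c(10, 20, 20, 30))        # 1 2 2 4
min_rank(desc(c(10, 20, 20, 30)))  # 4 2 2 1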

The percent_rank function produces a ‘percentile rank’ (but between 0 and 1).

The top 5% of countries in life expectancy in 2007 from the gapminder data set:

library(gapminder)
gm <- filter(gapminder, year == 2007)
gm_top <- filter(gm, percent_rank(desc(lifeExp)) <= 0.05)
gm_top
## # A tibble: 8 x 6
##   country          continent  year lifeExp       pop gdpPercap
##   <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
## 1 Australia        Oceania    2007    81.2  20434176    34435.
## 2 Hong Kong, China Asia       2007    82.2   6980412    39725.
## 3 Iceland          Europe     2007    81.8    301931    36181.
## 4 Israel           Asia       2007    80.7   6426679    25523.
## 5 Japan            Asia       2007    82.6 127467972    31656.
## 6 Spain            Europe     2007    80.9  40448191    28821.
## 7 Sweden           Europe     2007    80.9   9031088    33860.
## 8 Switzerland      Europe     2007    81.7   7554661    37506.

And the bottom 5%:

filter(gm, percent_rank(lifeExp) <= 0.05)
## # A tibble: 8 x 6
##   country      continent  year lifeExp      pop gdpPercap
##   <fct>        <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan  Asia       2007    43.8 31889923      975.
## 2 Angola       Africa     2007    42.7 12420476     4797.
## 3 Lesotho      Africa     2007    42.6  2012649     1569.
## 4 Mozambique   Africa     2007    42.1 19951656      824.
## 5 Sierra Leone Africa     2007    42.6  6144562      863.
## 6 Swaziland    Africa     2007    39.6  1133066     4513.
## 7 Zambia       Africa     2007    42.4 11746035     1271.
## 8 Zimbabwe     Africa     2007    43.5 12311143      470.

Within these results the original row order is retained.

The arrange function can be used to change this.

Arranging Rows

arrange reorders the rows according to one or more variables.

By default the order is smallest to largest; using desc() reverses this.

Additional variables will be used to break ties.

Arranging the top 5% of countries in life expectancy in 2007 by life expectancy:

arrange(gm_top, desc(lifeExp))
## # A tibble: 8 x 6
##   country          continent  year lifeExp       pop gdpPercap
##   <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
## 1 Japan            Asia       2007    82.6 127467972    31656.
## 2 Hong Kong, China Asia       2007    82.2   6980412    39725.
## 3 Iceland          Europe     2007    81.8    301931    36181.
## 4 Switzerland      Europe     2007    81.7   7554661    37506.
## 5 Australia        Oceania    2007    81.2  20434176    34435.
## 6 Spain            Europe     2007    80.9  40448191    28821.
## 7 Sweden           Europe     2007    80.9   9031088    33860.
## 8 Israel           Asia       2007    80.7   6426679    25523.

To group by continent and order by life expectancy within continent:

arrange(gm_top, continent, desc(lifeExp))
## # A tibble: 8 x 6
##   country          continent  year lifeExp       pop gdpPercap
##   <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
## 1 Japan            Asia       2007    82.6 127467972    31656.
## 2 Hong Kong, China Asia       2007    82.2   6980412    39725.
## 3 Israel           Asia       2007    80.7   6426679    25523.
## 4 Iceland          Europe     2007    81.8    301931    36181.
## 5 Switzerland      Europe     2007    81.7   7554661    37506.
## 6 Spain            Europe     2007    80.9  40448191    28821.
## 7 Sweden           Europe     2007    80.9   9031088    33860.
## 8 Australia        Oceania    2007    81.2  20434176    34435.

Missing values are sorted to the end:

df <- data.frame(x = c(5, 2, NA, 3))
arrange(df, x)
##    x
## 1  2
## 2  3
## 3  5
## 4 NA
arrange(df, desc(x))
##    x
## 1  5
## 2  3
## 3  2
## 4 NA

Using is.na you can arrange to put them at the beginning:

arrange(df, desc(is.na(x)), x)
##    x
## 1 NA
## 2  2
## 3  3
## 4  5

Summarizing and Grouping

summarize collapses a table down to one row of summaries.

The average and maximal departure and arrival delays for flights, along with the total number of flights:

summarize(flights,
          ave_dep_delay = mean(dep_delay, na.rm = TRUE),
          ave_arr_delay = mean(arr_delay, na.rm = TRUE),
          max_dep_delay = max(dep_delay, na.rm = TRUE),
          max_arr_delay = max(arr_delay, na.rm = TRUE),
          n = n())
## # A tibble: 1 x 5
##   ave_dep_delay ave_arr_delay max_dep_delay max_arr_delay      n
##           <dbl>         <dbl>         <dbl>         <dbl>  <int>
## 1          12.6          6.90          1301          1272 336776

Grouped Summaries

summarize is most commonly used with grouped data to provide group-level summaries.

Grouping delay summaries by destination:

fl <- summarize(group_by(flights, dest),
                ave_dep_delay = mean(dep_delay, na.rm = TRUE),
                ave_arr_delay = mean(arr_delay, na.rm = TRUE),
                max_dep_delay = max(dep_delay, na.rm = TRUE),
                max_arr_delay = max(arr_delay, na.rm = TRUE),
                n = n())
fl_dest <- fl
head(fl)
## # A tibble: 6 x 6
##   dest  ave_dep_delay ave_arr_delay max_dep_delay max_arr_delay     n
##   <chr>         <dbl>         <dbl>         <dbl>         <dbl> <int>
## 1 ABQ           13.7           4.38           142           153   254
## 2 ACK            6.46          4.85           219           221   265
## 3 ALB           23.6          14.4            323           328   439
## 4 ANC           12.9          -2.5             75            39     8
## 5 ATL           12.5          11.3            898           895 17215
## 6 AUS           13.0           6.02           351           349  2439

The group level summaries can be further transformed, for example to identify the top 10 destinations for average delays:

flr <- mutate(fl, rank = min_rank(desc(ave_dep_delay)))
filter(flr, rank <= 10)
## # A tibble: 10 x 7
##    dest  ave_dep_delay ave_arr_delay max_dep_delay max_arr_delay     n
##    <chr>         <dbl>         <dbl>         <dbl>         <dbl> <int>
##  1 ALB            23.6          14.4           323           328   439
##  2 BHM            29.7          16.9           325           291   297
##  3 CAE            35.6          41.8           222           224   116
##  4 DSM            26.2          19.0           341           322   569
##  5 JAC            26.5          28.1           198           175    25
##  6 MSN            23.6          20.2           340           364   572
##  7 OKC            30.6          30.6           280           262   346
##  8 RIC            23.6          20.1           483           463  2454
##  9 TUL            34.9          33.7           251           262   315
## 10 TYS            28.5          24.1           291           281   631
## # … with 1 more variable: rank <int>

An alternative to explicitly computing ranks and filtering is the top_n utility function:

top_n(fl, 10, ave_dep_delay)
## # A tibble: 10 x 6
##    dest  ave_dep_delay ave_arr_delay max_dep_delay max_arr_delay     n
##    <chr>         <dbl>         <dbl>         <dbl>         <dbl> <int>
##  1 ALB            23.6          14.4           323           328   439
##  2 BHM            29.7          16.9           325           291   297
##  3 CAE            35.6          41.8           222           224   116
##  4 DSM            26.2          19.0           341           322   569
##  5 JAC            26.5          28.1           198           175    25
##  6 MSN            23.6          20.2           340           364   572
##  7 OKC            30.6          30.6           280           262   346
##  8 RIC            23.6          20.1           483           463  2454
##  9 TUL            34.9          33.7           251           262   315
## 10 TYS            28.5          24.1           291           281   631

Missing Values

Most summaries will be NA if any of the data values being summarized are NA.

Typically, summary functions can be called with na.rm = TRUE to produce the summary for all non-missing values.

You can also use filter to remove rows with NA values in the variables to be summarized:

fl <- filter(flights, ! is.na(dep_delay), ! is.na(arr_delay))
summarize(fl,
          ave_dep_delay = mean(dep_delay),
          ave_arr_delay = mean(arr_delay),
          max_dep_delay = max(dep_delay),
          max_arr_delay = max(arr_delay),
          n = n())
## # A tibble: 1 x 5
##   ave_dep_delay ave_arr_delay max_dep_delay max_arr_delay      n
##           <dbl>         <dbl>         <dbl>         <dbl>  <int>
## 1          12.6          6.90          1301          1272 327346

Counts

The helper function n() provides a count of the number of rows in the table being summarized.

Including counts with grouped summaries is usually a good idea.

Summaries based on small counts are likely to be much less reliable than ones based on larger counts.

This is reflected in higher variability among averages for destinations with fewer flights:

ggplot(fl_dest) + geom_point(aes(x = n, y = ave_arr_delay))
## Warning: Removed 1 rows containing missing values (geom_point).

A plotly version allows the individual destinations to be identified easily:

library(plotly)
p <- ggplot(fl_dest) +
    geom_point(aes(x = n, y = ave_arr_delay, text = dest))
## Warning: Ignoring unknown aesthetics: text
ggplotly(p, tooltip = "text")

It would be more user-friendly to show the airport name than the FAA code.

Useful Summaries

  • Location: mean, median
  • Spread: sd, IQR, mad
  • Rank: min, quantile, max
  • Position: first, nth, last
  • Counts: n()

Sometimes it is useful to look at subsets or truncated versions of a variable.

Airlines like to arrange schedules so they are often early.

This reduces the average arrival delay but may not help travelers much.

Two options:

  • Consider only the positive delays for the average.
  • Treat negative delays as zero (zero-truncation).

The mean of the positive delays can be computed using subsetting:

mean(arr_delay[arr_delay > 0])

The mean of the zero-truncated delays can be computed using the pmax function:

mean(pmax(arr_delay, 0))
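
Unlike max, which collapses its arguments to a single value, pmax is the element-wise (parallel) maximum; a quick sketch:

pmax(c(-5, 3, -2), 0)  # 0 3 0
max(c(-5, 3, -2), 0)   # 3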

The summaries are

fl_arr <- filter(flights, ! is.na(arr_delay))
fl_avs <- summarize(group_by(fl_arr, dest),
                    ave = mean(arr_delay),
                    ave_pos = mean(arr_delay[arr_delay > 0]),
                    ave_pos_zero = mean(pmax(arr_delay, 0)),
                    n = n())
fl_avs
## # A tibble: 104 x 5
##    dest     ave ave_pos ave_pos_zero     n
##    <chr>  <dbl>   <dbl>        <dbl> <int>
##  1 ABQ     4.38    41.9        17.7    254
##  2 ACK     4.85    28.6        11.3    264
##  3 ALB    14.4     52.1        22.9    418
##  4 ANC    -2.5     12.4         7.75     8
##  5 ATL    11.3     37.8        17.8  16837
##  6 AUS     6.02    40.6        16.6   2411
##  7 AVL     8.00    30.8        14.1    261
##  8 BDL     7.05    48.3        16.9    412
##  9 BGR     8.03    50.7        19.4    358
## 10 BHM    16.9     60.1        27.0    269
## # … with 94 more rows

One approach to visualizing the results is to color code the three different summaries:

  • View the fl_avs table as a wide format with ave, ave_pos, and ave_pos_zero as three different measurement types.
  • Use gather to convert to a long format with a key variable named which to hold the summary type and a value variable named delay to hold the summary value:
library(tidyr)
fl <- gather(fl_avs, which, delay, 2:4)
arrange(fl, dest)
## # A tibble: 312 x 4
##    dest      n which         delay
##    <chr> <int> <chr>         <dbl>
##  1 ABQ     254 ave            4.38
##  2 ABQ     254 ave_pos       41.9 
##  3 ABQ     254 ave_pos_zero  17.7 
##  4 ACK     264 ave            4.85
##  5 ACK     264 ave_pos       28.6 
##  6 ACK     264 ave_pos_zero  11.3 
##  7 ALB     418 ave           14.4 
##  8 ALB     418 ave_pos       52.1 
##  9 ALB     418 ave_pos_zero  22.9 
## 10 ANC       8 ave           -2.5 
## # … with 302 more rows
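
In more recent versions of tidyr (1.0 and later) the same reshaping can be done with pivot_longer; a sketch of an equivalent call:

fl <- pivot_longer(fl_avs, 2:4, names_to = "which", values_to = "delay")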

A plot is then easy to create:

ggplot(fl, aes(x = n, y = delay, color = which)) + geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

An alternative:

  • Use one layer per summary type.
  • Specify the summary type for each layer as a color aesthetic value.
ggplot(fl_avs, aes(x = n)) +
    geom_point(aes(y = ave, color = "ave")) +
    geom_point(aes(y = ave_pos, color = "ave_pos")) +
    geom_point(aes(y = ave_pos_zero, color = "ave_pos_zero"))
## Warning: Removed 1 rows containing missing values (geom_point).

Proportions satisfying a condition can be computed as a mean of the logical expression for the condition:

fl_props <- summarize(group_by(fl_arr, dest),
                      pd0 = mean(arr_delay > 0),
                      pd10 = mean(arr_delay > 10),
                      pd20 = mean(arr_delay > 20),
                      n = n())
fl_props
## # A tibble: 104 x 5
##    dest    pd0  pd10  pd20     n
##    <chr> <dbl> <dbl> <dbl> <int>
##  1 ABQ   0.421 0.339 0.268   254
##  2 ACK   0.394 0.246 0.167   264
##  3 ALB   0.440 0.333 0.273   418
##  4 ANC   0.625 0.125 0.125     8
##  5 ATL   0.472 0.309 0.211 16837
##  6 AUS   0.408 0.284 0.216  2411
##  7 AVL   0.456 0.257 0.207   261
##  8 BDL   0.350 0.262 0.223   412
##  9 BGR   0.383 0.299 0.246   358
## 10 BHM   0.450 0.361 0.305   269
## # … with 94 more rows

These can be visualized similarly:

fl <- gather(fl_props, which, prop, 2:4)
ggplot(fl, aes(x = n, y = prop, color = which)) + geom_point()

Grouped Mutate and Filter

mutate, filter, arrange and top_n can be used on a group level.

mutate within groups can be used for standardizing within a group.

For example, to compute the proportion of a continent’s population living in each country for each year:

gm <- mutate(group_by(gapminder, continent, year),
             pop = as.numeric(pop),
             ppop = pop / sum(pop))

The grouping is still in effect for the result, so it will be used by grouping-aware operations.

Focus on the 2 countries in each continent with the highest proportions of the continent’s population:

gmt2 <- top_n(gm, 2, ppop)
gmt2 <- filter(gmt2, continent != "Oceania")
ggplot(gmt2, aes(x = year, y = ppop, group = country, color = country)) +
    geom_line() + geom_point() +
    facet_wrap(~continent, scales="free_y")

Another approach:

A useful feature is that the %+% operator can be used to change the base data frame for a plot:

pAf <- ggplot(filter(gmt2, continent == "Africa"),
              aes(x = year, y = ppop, col = country)) +
    geom_line() + geom_point() +
    facet_wrap(~continent) +
    theme(legend.position = "top", legend.title = element_blank())
thm_nx <- theme(axis.title.x = element_blank())
thm_ny <- theme(axis.title.y = element_blank())
pAm <- pAf %+% filter(gmt2, continent == "Americas")
pAs <- pAf %+% filter(gmt2, continent == "Asia")
pEu <- pAf %+% filter(gmt2, continent == "Europe")
cowplot::plot_grid(pAf + thm_nx, pAm + thm_nx +thm_ny, pAs, pEu + thm_ny)

Changes over time within a country can also be examined with a grouped mutate and the lag operator:

gm <- mutate(group_by(gapminder, continent, country),
             le_change = lifeExp - lag(lifeExp),
             gdp_change = gdpPercap - lag(gdpPercap),
             gdp_pct_change = 100 * gdp_change / lag(gdpPercap))

top_n applied to this grouped result can be used to extract the rows with the highest gdp_pct_change for each country:

top_n(gm, 1, gdp_pct_change)
## # A tibble: 142 x 9
## # Groups:   continent, country [142]
##    country continent  year lifeExp    pop gdpPercap le_change gdp_change
##    <fct>   <fct>     <int>   <dbl>  <int>     <dbl>     <dbl>      <dbl>
##  1 Afghan… Asia       2007    43.8 3.19e7      975.     1.70        248.
##  2 Albania Europe     2002    75.7 3.51e6     4604.     2.70       1411.
##  3 Algeria Africa     1972    54.5 1.48e7     4183.     3.11        936.
##  4 Angola  Africa     2007    42.7 1.24e7     4797.     1.73       2024.
##  5 Argent… Americas   2007    75.3 4.03e7    12779.     0.980      3982.
##  6 Austra… Oceania    1967    71.1 1.19e7    14526.     0.170      2309.
##  7 Austria Europe     1957    67.5 6.97e6     8843.     0.68       2706.
##  8 Bahrain Asia       2007    75.6 7.09e5    29796.     0.84       6392.
##  9 Bangla… Asia       2007    64.1 1.50e8     1391.     2.05        255.
## 10 Belgium Europe     1972    71.4 9.71e6    16672.     0.5        3523.
## # … with 132 more rows, and 1 more variable: gdp_pct_change <dbl>

Removing the grouping allows the best row overall to be computed:

top_n(ungroup(gm), 1, gdp_pct_change)
## # A tibble: 1 x 9
##   country continent  year lifeExp    pop gdpPercap le_change gdp_change
##   <fct>   <fct>     <int>   <dbl>  <int>     <dbl>     <dbl>      <dbl>
## 1 Libya   Africa     1967    50.2 1.76e6    18773.      2.42     12016.
## # … with 1 more variable: gdp_pct_change <dbl>

Applying a new grouping, for example by continent, replaces the old grouping. This can be used to find the worst drop in life expectancy for each continent:

top_n(group_by(gm, continent), 1, desc(le_change))
## # A tibble: 5 x 9
## # Groups:   continent [5]
##   country continent  year lifeExp    pop gdpPercap le_change gdp_change
##   <fct>   <fct>     <int>   <dbl>  <int>     <dbl>     <dbl>      <dbl>
## 1 Austra… Oceania    1967    71.1 1.19e7    14526.     0.170     2309. 
## 2 Cambod… Asia       1977    31.2 6.98e6      525.    -9.10       103. 
## 3 El Sal… Americas   1977    56.7 4.28e6     5139.    -1.51       619. 
## 4 Monten… Europe     2002    74.0 7.20e5     6557.    -1.46        91.6
## 5 Rwanda  Africa     1992    23.6 7.29e6      737.   -20.4       -111. 
## # … with 1 more variable: gdp_pct_change <dbl>
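
A grouped filter works along the same lines: the filtering condition is evaluated within each group. A sketch that recovers essentially the same rows as the last top_n call, up to ties:

filter(group_by(gm, continent), le_change == min(le_change, na.rm = TRUE))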

Joins

This interactive plot uses the tooltip to show the airport code for the destination:

library(plotly)
p <- ggplot(fl_dest) +
    geom_point(aes(x = n, y = ave_arr_delay, text = dest))
## Warning: Ignoring unknown aesthetics: text
ggplotly(p, tooltip = "text")

A nicer approach would be to show the airport name.

The airports table provides this information:

head(airports, 2)
## # A tibble: 2 x 8
##   faa   name                      lat   lon   alt    tz dst   tzone        
##   <chr> <chr>                   <dbl> <dbl> <int> <dbl> <chr> <chr>        
## 1 04G   Lansdowne Airport        41.1 -80.6  1044    -5 A     America/New_…
## 2 06A   Moton Field Municipal …  32.5 -85.7   264    -6 A     America/Chic…

We need to get this information into the fl_dest data frame.

As another example, we might want to explore whether aircraft age is related to delays.

The planes table provides the year of manufacture:

head(planes, 2)
## # A tibble: 2 x 9
##   tailnum  year type       manufacturer   model  engines seats speed engine
##   <chr>   <int> <chr>      <chr>          <chr>    <int> <int> <int> <chr> 
## 1 N10156   2004 Fixed win… EMBRAER        EMB-1…       2    55    NA Turbo…
## 2 N102UW   1998 Fixed win… AIRBUS INDUST… A320-…       2   182    NA Turbo…

Again, we need to merge this information with the data in the flights table.

One way to do this is with a join operation.

Joins are a common operation in data bases.

dplyr distinguishes between mutating joins and filtering joins.

Mutating Joins

Mutating joins combine variables and rows from two tables by matching rows on keys.

  • A primary key is a variable, or combination of variables, in a table that uniquely identifies the rows of the table.

  • A foreign key is a variable, or combination of variables, that uniquely identifies a row in some table (usually, but not necessarily, a different table).

The simplest join operation is an inner join: only rows for which the key appears in both tables are retained.

Outer joins (left, right, and full joins) also retain rows from one or both tables that have no match, filling in NA values for the new variables.
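
A minimal sketch with two small made-up tables (x and y here are hypothetical) showing the difference:

x <- tibble(key = c(1, 2, 3), val_x = c("x1", "x2", "x3"))
y <- tibble(key = c(1, 2, 4), val_y = c("y1", "y2", "y3"))
inner_join(x, y, "key")  # keys 1 and 2 only
left_join(x, y, "key")   # keys 1, 2, and 3, with NA for val_y at key 3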

  • Inner and left joins are the most common.

  • The key of one of the tables should usually be a primary key that uniquely identifies the table’s rows.

  • For a left join, the key in the right table should usually be a primary key.

It is usually a good idea to make sure a primary key candidate really does uniquely identify rows.

For the planes table the tailnum variable is a primary key:

filter(count(planes, tailnum), n > 1)
## # A tibble: 0 x 2
## # … with 2 variables: tailnum <chr>, n <int>

count is a helper function that captures a common group summary:

pl <- count(planes, tailnum)
head(pl, 2)
## # A tibble: 2 x 2
##   tailnum     n
##   <chr>   <int>
## 1 N10156      1
## 2 N102UW      1
pl <- summarize(group_by(planes, tailnum), n = n())
head(pl, 2)
## # A tibble: 2 x 2
##   tailnum     n
##   <chr>   <int>
## 1 N10156      1
## 2 N102UW      1

For the flights data a primary key needs to use multiple variables.

A reasonable guess is to use the date, carrier and flight number, but this isn’t quite enough:

fl <- filter(count(flights, year, month, day, flight, carrier), n > 1)
head(fl, 2)
## # A tibble: 2 x 6
##    year month   day flight carrier     n
##   <int> <int> <int>  <int> <chr>   <int>
## 1  2013     6     8   2269 WN          2
## 2  2013     6    15   2269 WN          2

Adding the origin airport code does produce a primary key:

filter(count(flights, year, month, day, flight, carrier, origin), n > 1)
## # A tibble: 0 x 7
## # … with 7 variables: year <int>, month <int>, day <int>, flight <int>,
## #   carrier <chr>, origin <chr>, n <int>

To relate delays to aircraft age we can start with summaries by tailnum and year (year would be needed if we had data for more than just 2013):

fl <- filter(flights, ! is.na(dep_time) , ! is.na(arr_time))
fl <- summarize(group_by(fl, year, tailnum),
                ave_dep_delay = mean(dep_delay, na.rm = TRUE),
                ave_arr_delay = mean(arr_delay, na.rm = TRUE),
                n = n())

A reduced planes table with year renamed to avoid a conflict:

pl <- select(planes, tailnum, plane_year = year)

Now a left join using tailnum as the key to bring the plane_year value into the summary table and compute the plane age:

fl <- left_join(fl, pl, "tailnum")
fl <- mutate(fl, plane_age = year - plane_year)
fl_age <- fl

An initial plot:

ggplot(fl, aes(x = plane_age, y = ave_dep_delay)) + geom_point()
## Warning: Removed 791 rows containing missing values (geom_point).

Jittering and adjusting the point size may help:

ggplot(fl, aes(x = plane_age, y = ave_dep_delay)) +
    geom_point(position = "jitter", size = 0.1)
## Warning: Removed 791 rows containing missing values (geom_point).

Adding a smooth and adjusting the ranges may also help:

ggplot(fl, aes(x = plane_age, y = ave_dep_delay)) +
    geom_point(position = "jitter", size = 0.1) +
    geom_smooth() +
    xlim(0, 30) + ylim(c(-20, 80))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 841 rows containing non-finite values (stat_smooth).
## Warning: Removed 881 rows containing missing values (geom_point).

Another option is to compute means or medians within years:

flm <- summarize(group_by(fl, plane_age),
                 ave_dep_delay = mean(ave_dep_delay, na.rm = TRUE))
ggplot(flm, aes(x = plane_age, y = ave_dep_delay)) + geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

Both show an increase over the first 10 years.

A left join can also be used to add the airport name from the airport table to the fl_dest table.

The primary key in the airport table is the three-letter airport code in the faa variable:

head(airports, 2)
## # A tibble: 2 x 8
##   faa   name                      lat   lon   alt    tz dst   tzone        
##   <chr> <chr>                   <dbl> <dbl> <int> <dbl> <chr> <chr>        
## 1 04G   Lansdowne Airport        41.1 -80.6  1044    -5 A     America/New_…
## 2 06A   Moton Field Municipal …  32.5 -85.7   264    -6 A     America/Chic…

The foreign key in the fl_dest table to match is the dest variable:

head(fl_dest, 2)
## # A tibble: 2 x 6
##   dest  ave_dep_delay ave_arr_delay max_dep_delay max_arr_delay     n
##   <chr>         <dbl>         <dbl>         <dbl>         <dbl> <int>
## 1 ABQ           13.7           4.38           142           153   254
## 2 ACK            6.46          4.85           219           221   265

The left_join function allows this link to be made by specifying the key as c("dest" = "faa"):

fl_dest <- left_join(fl_dest,
                     select(airports, faa, dest_name = name),
                     c("dest" = "faa"))
head(select(fl_dest, dest_name, everything()), 2)
## # A tibble: 2 x 7
##   dest_name dest  ave_dep_delay ave_arr_delay max_dep_delay max_arr_delay
##   <chr>     <chr>         <dbl>         <dbl>         <dbl>         <dbl>
## 1 Albuquer… ABQ           13.7           4.38           142           153
## 2 Nantucke… ACK            6.46          4.85           219           221
## # … with 1 more variable: n <int>

Now a tooltip can provide a more accessible label:

p <- ggplot(fl_dest) +
    geom_point(aes(x = n, y = ave_arr_delay, text = dest_name))
## Warning: Ignoring unknown aesthetics: text
ggplotly(p, tooltip = "text")

Filtering Joins

Filtering joins only affect rows; they do not add variables.

  • semi_join(x, y) returns all rows in x with a match in y.
  • anti_join(x, y) returns all rows in x without a match in y (a small sketch follows).
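
For example, a semi-join picks out the airports that appear as a destination in flights, and an anti-join those that do not:

## airports that are a destination of some flight from New York
served <- semi_join(airports, flights, c("faa" = "dest"))
## airports that never appear as a destination in this data
unserved <- anti_join(airports, flights, c("faa" = "dest"))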

semi_join is useful for matching filtered results back to a table.

The plot of delay against age showed some surprisingly old aircraft:

top_age <- top_n(fl_age, 2, plane_age)
top_age
## # A tibble: 3 x 7
## # Groups:   year [1]
##    year tailnum ave_dep_delay ave_arr_delay     n plane_year plane_age
##   <int> <chr>           <dbl>         <dbl> <int>      <int>     <int>
## 1  2013 N201AA          17.4          14.1     51       1959        54
## 2  2013 N381AA           5.95          3.5     22       1956        57
## 3  2013 N567AA           3.72         -4.30    61       1959        54

We can identify these aircraft with a semi-join:

semi_join(planes, top_age, "tailnum")
## # A tibble: 3 x 9
##   tailnum  year type       manufacturer model   engines seats speed engine 
##   <chr>   <int> <chr>      <chr>        <chr>     <int> <int> <int> <chr>  
## 1 N201AA   1959 Fixed win… CESSNA       150           1     2    90 Recipr…
## 2 N381AA   1956 Fixed win… DOUGLAS      DC-7BF        4   102   232 Recipr…
## 3 N567AA   1959 Fixed win… DEHAVILLAND  OTTER …       1    16    95 Recipr…

The corresponding flights can also be picked out with a semi-join:

top_age_flights <- semi_join(flights, top_age, "tailnum")
head(top_age_flights, 2)
## # A tibble: 2 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     3      909            700       129     1103
## 2  2013     1     3       NA            920        NA       NA
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Looking at the carriers or destinations might help:

head(select(top_age_flights, carrier, dest, everything()), 4)
## # A tibble: 4 x 19
##   carrier dest   year month   day dep_time sched_dep_time dep_delay
##   <chr>   <chr> <int> <int> <int>    <int>          <int>     <dbl>
## 1 AA      ORD    2013     1     3      909            700       129
## 2 AA      DFW    2013     1     3       NA            920        NA
## 3 AA      DFW    2013     1     9     1423           1430        -7
## 4 AA      DFW    2013     1    10     1238           1240        -2
## # … with 11 more variables: arr_time <int>, sched_arr_time <int>,
## #   arr_delay <dbl>, flight <int>, tailnum <chr>, origin <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

The carrier summary:

count(top_age_flights, carrier)
## # A tibble: 1 x 2
##   carrier     n
##   <chr>   <int>
## 1 AA        139

A check of the documentation for planes shows that American and one other airline report fleet numbers rather than tail numbers.

Anti-joins are useful for detecting join problems.

For example, many flights have tail numbers that do not match entries in planes:

fl <- anti_join(flights, planes, "tailnum")
count(fl)
## # A tibble: 1 x 1
##       n
##   <int>
## 1 52606
head(count(fl, tailnum), 4)
## # A tibble: 4 x 2
##   tailnum     n
##   <chr>   <int>
## 1 D942DN      4
## 2 N0EGMQ    371
## 3 N14628      1
## 4 N149AT     22

Join Issues

  • Make sure a primary key you want to use has no missing values.

  • Make sure a primary key you want to use really does uniquely identify each row.

  • Make sure every value of your foreign key you want to use to link to another table matches a row in the linked table (anti_join is a good way to do this).

  • Check that your foreign key has no missing values.
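
The duplicate-key and unmatched-key checks were illustrated above with count and anti_join; a sketch of the missing-value checks for the flights/planes link:

filter(planes, is.na(tailnum))   # primary key missing values
filter(flights, is.na(tailnum))  # foreign key missing values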

Data Base Interface

Current data on airline flights in the US is available from the Bureau of Transportation Statistics at

http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236

The data are available in one compressed CSV file per month.

Loaded into R, one month of data requires about 300 Mb of memory.

Data for 1987–2008 was compiled for the 2009 Data Expo competition of the American Statistical Association.

The data are available on the Linux systems in the directory /group/statsoft/data/DataExpo09 as compressed CSV files and as a SQLite data base.

Loading the data into R would require about 70 Gb of memory, which is not practical.

Using the data base is a good alternative.

SQLite is a simple relational data base that works with data stored on a local disk.

The first step is to connect to the data base.

library(dplyr)
library(DBI)
library(RSQLite)
db <- dbConnect(SQLite(), "/group/statsoft/data/DataExpo09/ontime.sqlite3")
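
If you do not have access to the class file systems, a similar setup can be sketched with an in-memory data base and a copy of the nycflights13 flights table (copy_to is a dplyr generic with a data base method supplied by dbplyr); the rest of this section uses the class data base:

db_mem <- dbConnect(SQLite(), ":memory:")
copy_to(db_mem, nycflights13::flights, "flights")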

There is only one table:

dbListTables(db)

tbl creates a tibble-like object for a data base table.

fl <- tbl(db, "ontime")

This object is not a regular tibble, but many tibble operations work on it.

To find the variables in the table:

tbl_vars(fl)

Many dplyr operations are supported:

sm <- summarize(fl, last_year = max(year), n = n())

This computation is fast essentially because it does not actually access the data base.

An operation that needs the data, such as printing the result, will access the data base and take time:

sm

The sm object essentially contains a SQL query, which can be examined using sql_render from the dbplyr package:

dbplyr::sql_render(sm)

Some more examples:

fl_cid <- filter(fl, year == 2008, Dest == "CID")
dbplyr::sql_render(fl_cid)
sm <- summarize(fl_cid, n = n())
dbplyr::sql_render(sm)
sm

sm <- summarize(group_by(fl_cid, Origin), n = n())
dbplyr::sql_render(sm)
sm

sm <- summarize(group_by(fl_cid, Origin),
                ave_arr_del_pos = mean(pmax(ArrDelay, 0), na.rm = TRUE))
dbplyr::sql_render(sm)
sm
sm_cid <- sm

mutate can also be used:

fl_cid_gain <- mutate(fl_cid, Gain = ArrDelay - DepDelay)
dbplyr::sql_render(fl_cid_gain)
summarize(group_by(fl_cid_gain, Origin), mean(Gain))

Once a filtered result is small enough it can be brought over as a regular tibble using collect:

tb <- collect(fl_cid)

There are some limitations to operations that can be done on tables in data bases:

pos_mean <- function(x) mean(pmax(x, 0))
summarize(group_by(tb, Origin),
                ave_arr_del_pos = pos_mean(ArrDelay))

summarize(group_by(fl_cid, Origin),
          ave_arr_del_pos = pos_mean(ArrDelay))

The first summarize call works because tb is an ordinary tibble; the second fails because a user-defined R function like pos_mean cannot be translated to SQL. So some operations may require modifications:

fl <- tbl(db, "ontime")
fl_cid <- filter(fl, year == 2008, Dest == "CID")
sm_cid <- summarize(group_by(fl_cid, Origin),
                    ave_arr_del_pos = mean(pmax(ArrDelay, 0), na.rm = TRUE))
ap <- select(nycflights13::airports, faa, name)

sm_cid <- left_join(sm_cid, ap, c(Origin = "faa"))

This fails because sm_cid refers to a table in the data base while ap is a local tibble. Supplying copy = TRUE copies the local table into the data base so the join can be carried out there:

sm_cid <- left_join(sm_cid, ap, c(Origin = "faa"), copy = TRUE)

This copies the ap table to the data base:

dbListTables(db)

Finish by disconnecting from the data base:

dbDisconnect(db)

This is not necessary for SQLite but is for other data bases, so it is good practice.

Some notes:

The dbplyr web site provides more information on using data bases with dplyr.