Jeff Stevens

2023-02-20

# Review

## The problem

What’s different between these data sets? What can we use to create data2 from data1?

data1
  cond id       date
1    3  1 2022-02-13
2    2  2 2022-01-28
3    3  3 2022-01-11
4    2  4 2022-01-10
5    1  5 2022-01-22
6    1  6 2022-01-01
7    2  7 2022-01-14
8    1  8 2022-01-26
data2
         cond id    month       date
1 Condition 3  1 February 2022-02-13
2 Condition 2  2  January 2022-01-28
3 Condition 3  3  January 2022-01-11
4 Condition 2  4  January 2022-01-10
5 Condition 1  5  January 2022-01-22
6 Condition 1  6  January 2022-01-01
7 Condition 2  7  January 2022-01-14
8 Condition 1  8  January 2022-01-26

## The problem

What’s different between these data sets? What can we use to create data3 from data1?

data1
  cond id       date
1    3  1 2022-02-13
2    2  2 2022-01-28
3    3  3 2022-01-11
4    2  4 2022-01-10
5    1  5 2022-01-22
6    1  6 2022-01-01
7    2  7 2022-01-14
8    1  8 2022-01-26
data3
  id cond
1  1    3
2  2    2
3  3    3
4  4    2
5  5    1
6  6    1
7  7    2
8  8    1

## The problem

What’s different between these data sets? What needs to happen to create data4 from data1?

data1
  cond id       date
1    3  1 2022-02-13
2    2  2 2022-01-28
3    3  3 2022-01-11
4    2  4 2022-01-10
5    1  5 2022-01-22
6    1  6 2022-01-01
7    2  7 2022-01-14
8    1  8 2022-01-26
data4
  cond id       date
1    1  6 2022-01-01
2    2  4 2022-01-10
3    2  7 2022-01-14
4    1  5 2022-01-22
5    1  8 2022-01-26
6    2  2 2022-01-28

## Set-up

library(dplyr)
library(nycflights13)
(flights2 <- select(flights, year:dep_delay, air_time))
# A tibble: 336,776 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013     1     1      517            515         2      227
2  2013     1     1      533            529         4      227
3  2013     1     1      542            540         2      160
4  2013     1     1      544            545        -1      183
5  2013     1     1      554            600        -6      116
6  2013     1     1      554            558        -4      150
7  2013     1     1      555            600        -5      158
8  2013     1     1      557            600        -3       53
9  2013     1     1      557            600        -3      140
10  2013     1     1      558            600        -2      138
# ℹ 336,766 more rows

# Filtering rows

## filter()

filter(flights2, dep_time == sched_dep_time)
# A tibble: 16,514 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013     1     1      559            559         0       44
2  2013     1     1      600            600         0      152
3  2013     1     1      600            600         0      134
4  2013     1     1      607            607         0      157
5  2013     1     1      615            615         0      182
6  2013     1     1      615            615         0      120
7  2013     1     1      635            635         0      248
8  2013     1     1      655            655         0      294
9  2013     1     1      739            739         0      249
10  2013     1     1      745            745         0      378
# ℹ 16,504 more rows

## filter()

### Multiple conditions

filter(flights2, dep_delay < 0 & month == 2)
# A tibble: 13,397 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013     2     1      456            500        -4       98
2  2013     2     1      520            525        -5      209
3  2013     2     1      527            530        -3      233
4  2013     2     1      532            540        -8      195
5  2013     2     1      552            600        -8       58
6  2013     2     1      552            600        -8      227
7  2013     2     1      552            600        -8       42
8  2013     2     1      553            600        -7      134
9  2013     2     1      553            600        -7      125
10  2013     2     1      553            600        -7       49
# ℹ 13,387 more rows

## filter()

### Logical OR

filter(flights2, month == 11 | month == 12)
# A tibble: 55,403 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013    11     1        5           2359         6      205
2  2013    11     1       35           2250       105       36
3  2013    11     1      455            500        -5       88
4  2013    11     1      539            545        -6      229
5  2013    11     1      542            545        -3      147
6  2013    11     1      549            600       -11      359
7  2013    11     1      550            600       -10       57
8  2013    11     1      554            600        -6       40
9  2013    11     1      554            600        -6      126
10  2013    11     1      554            600        -6       93
# ℹ 55,393 more rows

## filter()

### Logical %in%

filter(flights2, month %in% c(11, 12))
# A tibble: 55,403 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013    11     1        5           2359         6      205
2  2013    11     1       35           2250       105       36
3  2013    11     1      455            500        -5       88
4  2013    11     1      539            545        -6      229
5  2013    11     1      542            545        -3      147
6  2013    11     1      549            600       -11      359
7  2013    11     1      550            600       -10       57
8  2013    11     1      554            600        -6       40
9  2013    11     1      554            600        -6      126
10  2013    11     1      554            600        -6       93
# ℹ 55,393 more rows

## filter()

### Negating conditional

filter(flights2, month != 1)
# A tibble: 309,772 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013    10     1      447            500       -13       69
2  2013    10     1      522            517         5      174
3  2013    10     1      536            545        -9      132
4  2013    10     1      539            545        -6      172
5  2013    10     1      539            545        -6      186
6  2013    10     1      544            550        -6      191
7  2013    10     1      549            600       -11       46
8  2013    10     1      550            600       -10       38
9  2013    10     1      550            600       -10       39
10  2013    10     1      551            600        -9      117
# ℹ 309,762 more rows

## filter()

### Negating multiple conditionals

filter(flights2, month != 1 & day != 1)
# A tibble: 299,578 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013    10     2      449            500       -11       73
2  2013    10     2      513            517        -4      180
3  2013    10     2      537            545        -8      190
4  2013    10     2      540            545        -5      181
5  2013    10     2      543            545        -2      134
6  2013    10     2      546            550        -4      192
7  2013    10     2      548            600       -12       48
8  2013    10     2      548            600       -12       42
9  2013    10     2      550            600       -10       32
10  2013    10     2      552            600        -8       40
# ℹ 299,568 more rows

## filter()

### Check if NA with is.na()

filter(flights2, is.na(dep_time))
# A tibble: 8,255 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013     1     1       NA           1630        NA       NA
2  2013     1     1       NA           1935        NA       NA
3  2013     1     1       NA           1500        NA       NA
4  2013     1     1       NA            600        NA       NA
5  2013     1     2       NA           1540        NA       NA
6  2013     1     2       NA           1620        NA       NA
7  2013     1     2       NA           1355        NA       NA
8  2013     1     2       NA           1420        NA       NA
9  2013     1     2       NA           1321        NA       NA
10  2013     1     2       NA           1545        NA       NA
# ℹ 8,245 more rows

## filter()

### Check if NA across multiple columns with is.na() and if_any()

filter(flights2, if_any(everything(), is.na))
# A tibble: 9,430 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013     1     1     1525           1530        -5       NA
2  2013     1     1     1528           1459        29       NA
3  2013     1     1     1740           1745        -5       NA
4  2013     1     1     1807           1738        29       NA
5  2013     1     1     1939           1840        59       NA
6  2013     1     1     1952           1930        22       NA
7  2013     1     1     2016           1930        46       NA
8  2013     1     1       NA           1630        NA       NA
9  2013     1     1       NA           1935        NA       NA
10  2013     1     1       NA           1500        NA       NA
# ℹ 9,420 more rows

## drop_na()

### Filter column with anyNAs with drop_na()

nrow(flights2)
[1] 336776
nrow(drop_na(flights2))
[1] 327346
nrow(drop_na(flights, dep_time))
[1] 328521

# Sorting rows

## arrange()

arrange(flights2, sched_dep_time)
# A tibble: 336,776 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013     7    27       NA            106        NA       NA
2  2013     1     2      458            500        -2      108
3  2013     1     3      458            500        -2       94
4  2013     1     4      456            500        -4       77
5  2013     1     5      458            500        -2       85
6  2013     1     6      458            500        -2       88
7  2013     1     7      454            500        -6       86
8  2013     1     8      454            500        -6       77
9  2013     1     9      457            500        -3       87
10  2013     1    10      450            500       -10       78
# ℹ 336,766 more rows

## arrange()

### Sort multiple rows

arrange(flights2, sched_dep_time, dep_time)
# A tibble: 336,776 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013     7    27       NA            106        NA       NA
2  2013     5     8      445            500       -15       78
3  2013     5     5      446            500       -14       90
4  2013     9     4      446            500       -14       79
5  2013    10     1      447            500       -13       69
6  2013     9    19      447            500       -13       81
7  2013     1    29      448            500       -12       88
8  2013    12    27      448            500       -12       92
9  2013     5     7      448            500       -12       80
10  2013    10     2      449            500       -11       73
# ℹ 336,766 more rows

## arrange()

### Sort in descending order

arrange(flights2, desc(dep_time))
# A tibble: 336,776 × 7
year month   day dep_time sched_dep_time dep_delay air_time
<int> <int> <int>    <int>          <int>     <dbl>    <dbl>
1  2013    10    30     2400           2359         1      182
2  2013    11    27     2400           2359         1      230
3  2013    12     5     2400           2359         1      182
4  2013    12     9     2400           2359         1      195
5  2013    12     9     2400           2250        70       41
6  2013    12    13     2400           2359         1      192
7  2013    12    19     2400           2359         1      193
8  2013    12    29     2400           1700       420      161
9  2013     2     7     2400           2359         1      186
10  2013     2     7     2400           2359         1      194
# ℹ 336,766 more rows

## Solving the problem

What code turns data1 into data4?

data1
  cond id       date
1    3  1 2022-02-13
2    2  2 2022-01-28
3    3  3 2022-01-11
4    2  4 2022-01-10
5    1  5 2022-01-22
6    1  6 2022-01-01
7    2  7 2022-01-14
8    1  8 2022-01-26
data4
  cond id       date
1    1  6 2022-01-01
2    2  4 2022-01-10
3    2  7 2022-01-14
4    1  5 2022-01-22
5    1  8 2022-01-26
6    2  2 2022-01-28