data1
cond id date
1 2 1 2022-02-03
2 3 2 2022-02-09
3 3 3 2022-02-15
4 1 4 2022-02-02
5 2 5 2022-02-20
6 2 6 2022-02-18
7 2 7 2022-01-18
8 1 8 2022-01-17
2023-02-20
What’s different between these data sets? What can we use to create data2
from data1
?
data1
cond id date
1 2 1 2022-02-03
2 3 2 2022-02-09
3 3 3 2022-02-15
4 1 4 2022-02-02
5 2 5 2022-02-20
6 2 6 2022-02-18
7 2 7 2022-01-18
8 1 8 2022-01-17
data2
cond id month date
1 Condition 2 1 February 2022-02-03
2 Condition 3 2 February 2022-02-09
3 Condition 3 3 February 2022-02-15
4 Condition 1 4 February 2022-02-02
5 Condition 2 5 February 2022-02-20
6 Condition 2 6 February 2022-02-18
7 Condition 2 7 January 2022-01-18
8 Condition 1 8 January 2022-01-17
What’s different between these data sets? What can we use to create data3
from data1
?
data1
cond id date
1 2 1 2022-02-03
2 3 2 2022-02-09
3 3 3 2022-02-15
4 1 4 2022-02-02
5 2 5 2022-02-20
6 2 6 2022-02-18
7 2 7 2022-01-18
8 1 8 2022-01-17
data3
id cond
1 1 2
2 2 3
3 3 3
4 4 1
5 5 2
6 6 2
7 7 2
8 8 1
What’s different between these data sets? What needs to happen to create data4
from data1
?
data1
cond id date
1 2 1 2022-02-03
2 3 2 2022-02-09
3 3 3 2022-02-15
4 1 4 2022-02-02
5 2 5 2022-02-20
6 2 6 2022-02-18
7 2 7 2022-01-18
8 1 8 2022-01-17
data4
cond id date
1 1 8 2022-01-17
2 2 7 2022-01-18
3 1 4 2022-02-02
4 2 1 2022-02-03
5 2 6 2022-02-18
6 2 5 2022-02-20
library(dplyr)
library(nycflights13)
(flights2 <- select(flights, year:dep_delay, air_time))
# A tibble: 336,776 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 1 1 517 515 2 227
2 2013 1 1 533 529 4 227
3 2013 1 1 542 540 2 160
4 2013 1 1 544 545 -1 183
5 2013 1 1 554 600 -6 116
6 2013 1 1 554 558 -4 150
7 2013 1 1 555 600 -5 158
8 2013 1 1 557 600 -3 53
9 2013 1 1 557 600 -3 140
10 2013 1 1 558 600 -2 138
# ℹ 336,766 more rows
filter()
filter()
filter(flights2, dep_time == sched_dep_time)
# A tibble: 16,514 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 1 1 559 559 0 44
2 2013 1 1 600 600 0 152
3 2013 1 1 600 600 0 134
4 2013 1 1 607 607 0 157
5 2013 1 1 615 615 0 182
6 2013 1 1 615 615 0 120
7 2013 1 1 635 635 0 248
8 2013 1 1 655 655 0 294
9 2013 1 1 739 739 0 249
10 2013 1 1 745 745 0 378
# ℹ 16,504 more rows
filter()
filter(flights2, dep_delay < 0 & month == 2)
# A tibble: 13,397 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 2 1 456 500 -4 98
2 2013 2 1 520 525 -5 209
3 2013 2 1 527 530 -3 233
4 2013 2 1 532 540 -8 195
5 2013 2 1 552 600 -8 58
6 2013 2 1 552 600 -8 227
7 2013 2 1 552 600 -8 42
8 2013 2 1 553 600 -7 134
9 2013 2 1 553 600 -7 125
10 2013 2 1 553 600 -7 49
# ℹ 13,387 more rows
filter()
filter(flights2, month == 11 | month == 12)
# A tibble: 55,403 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 11 1 5 2359 6 205
2 2013 11 1 35 2250 105 36
3 2013 11 1 455 500 -5 88
4 2013 11 1 539 545 -6 229
5 2013 11 1 542 545 -3 147
6 2013 11 1 549 600 -11 359
7 2013 11 1 550 600 -10 57
8 2013 11 1 554 600 -6 40
9 2013 11 1 554 600 -6 126
10 2013 11 1 554 600 -6 93
# ℹ 55,393 more rows
filter()
# A tibble: 55,403 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 11 1 5 2359 6 205
2 2013 11 1 35 2250 105 36
3 2013 11 1 455 500 -5 88
4 2013 11 1 539 545 -6 229
5 2013 11 1 542 545 -3 147
6 2013 11 1 549 600 -11 359
7 2013 11 1 550 600 -10 57
8 2013 11 1 554 600 -6 40
9 2013 11 1 554 600 -6 126
10 2013 11 1 554 600 -6 93
# ℹ 55,393 more rows
filter()
filter(flights2, month != 1)
# A tibble: 309,772 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 10 1 447 500 -13 69
2 2013 10 1 522 517 5 174
3 2013 10 1 536 545 -9 132
4 2013 10 1 539 545 -6 172
5 2013 10 1 539 545 -6 186
6 2013 10 1 544 550 -6 191
7 2013 10 1 549 600 -11 46
8 2013 10 1 550 600 -10 38
9 2013 10 1 550 600 -10 39
10 2013 10 1 551 600 -9 117
# ℹ 309,762 more rows
filter()
filter(flights2, month != 1 & day != 1)
# A tibble: 299,578 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 10 2 449 500 -11 73
2 2013 10 2 513 517 -4 180
3 2013 10 2 537 545 -8 190
4 2013 10 2 540 545 -5 181
5 2013 10 2 543 545 -2 134
6 2013 10 2 546 550 -4 192
7 2013 10 2 548 600 -12 48
8 2013 10 2 548 600 -12 42
9 2013 10 2 550 600 -10 32
10 2013 10 2 552 600 -8 40
# ℹ 299,568 more rows
filter()
NA
with is.na()
# A tibble: 8,255 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 1 1 NA 1630 NA NA
2 2013 1 1 NA 1935 NA NA
3 2013 1 1 NA 1500 NA NA
4 2013 1 1 NA 600 NA NA
5 2013 1 2 NA 1540 NA NA
6 2013 1 2 NA 1620 NA NA
7 2013 1 2 NA 1355 NA NA
8 2013 1 2 NA 1420 NA NA
9 2013 1 2 NA 1321 NA NA
10 2013 1 2 NA 1545 NA NA
# ℹ 8,245 more rows
filter()
NA
across multiple columns with is.na()
and if_any()
filter(flights2, if_any(everything(), is.na))
# A tibble: 9,430 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 1 1 1525 1530 -5 NA
2 2013 1 1 1528 1459 29 NA
3 2013 1 1 1740 1745 -5 NA
4 2013 1 1 1807 1738 29 NA
5 2013 1 1 1939 1840 59 NA
6 2013 1 1 1952 1930 22 NA
7 2013 1 1 2016 1930 46 NA
8 2013 1 1 NA 1630 NA NA
9 2013 1 1 NA 1935 NA NA
10 2013 1 1 NA 1500 NA NA
# ℹ 9,420 more rows
drop_na()
NA
s with drop_na()
nrow(drop_na(flights, dep_time))
[1] 328521
arrange()
arrange()
arrange(flights2, sched_dep_time)
# A tibble: 336,776 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 7 27 NA 106 NA NA
2 2013 1 2 458 500 -2 108
3 2013 1 3 458 500 -2 94
4 2013 1 4 456 500 -4 77
5 2013 1 5 458 500 -2 85
6 2013 1 6 458 500 -2 88
7 2013 1 7 454 500 -6 86
8 2013 1 8 454 500 -6 77
9 2013 1 9 457 500 -3 87
10 2013 1 10 450 500 -10 78
# ℹ 336,766 more rows
arrange()
arrange(flights2, sched_dep_time, dep_time)
# A tibble: 336,776 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 7 27 NA 106 NA NA
2 2013 5 8 445 500 -15 78
3 2013 5 5 446 500 -14 90
4 2013 9 4 446 500 -14 79
5 2013 10 1 447 500 -13 69
6 2013 9 19 447 500 -13 81
7 2013 1 29 448 500 -12 88
8 2013 12 27 448 500 -12 92
9 2013 5 7 448 500 -12 80
10 2013 10 2 449 500 -11 73
# ℹ 336,766 more rows
arrange()
# A tibble: 336,776 × 7
year month day dep_time sched_dep_time dep_delay air_time
<int> <int> <int> <int> <int> <dbl> <dbl>
1 2013 10 30 2400 2359 1 182
2 2013 11 27 2400 2359 1 230
3 2013 12 5 2400 2359 1 182
4 2013 12 9 2400 2359 1 195
5 2013 12 9 2400 2250 70 41
6 2013 12 13 2400 2359 1 192
7 2013 12 19 2400 2359 1 193
8 2013 12 29 2400 1700 420 161
9 2013 2 7 2400 2359 1 186
10 2013 2 7 2400 2359 1 194
# ℹ 336,766 more rows
What code turns data1
into data4
?
data1
cond id date
1 2 1 2022-02-03
2 3 2 2022-02-09
3 3 3 2022-02-15
4 1 4 2022-02-02
5 2 5 2022-02-20
6 2 6 2022-02-18
7 2 7 2022-01-18
8 1 8 2022-01-17
data4
cond id date
1 1 8 2022-01-17
2 2 7 2022-01-18
3 1 4 2022-02-02
4 2 1 2022-02-03
5 2 6 2022-02-18
6 2 5 2022-02-20