Jeff Stevens

2023-03-01

# Introduction

## The problem

What’s different between these data sets?

What is needed to create data3 from data1 and data2?

data1
# A tibble: 12 × 3
id  cond  resp
<int> <int> <dbl>
1     1     1 0.223
2     1     2 0.254
3     1     3 0.309
4     2     1 0.143
5     2     2 0.783
6     2     3 0.385
7     3     1 0.779
8     3     2 0.618
9     3     3 0.176
10     4     1 0.791
11     4     2 0.227
12     4     3 0.627
data2
# A tibble: 6 × 2
id   age
<int> <int>
1     1    48
2     2    29
3     3    51
4     4    49
5     5    41
6     6    28
data3
# A tibble: 12 × 4
id   age  cond  resp
<int> <int> <int> <dbl>
1     1    48     1 0.223
2     1    48     2 0.254
3     1    48     3 0.309
4     2    29     1 0.143
5     2    29     2 0.783
6     2    29     3 0.385
7     3    51     1 0.779
8     3    51     2 0.618
9     3    51     3 0.176
10     4    49     1 0.791
11     4    49     2 0.227
12     4    49     3 0.627

## Set-up

library(dplyr)
library(nycflights13)
(flights2 <- select(flights, year:dep_time, carrier, tailnum))
# A tibble: 336,776 × 6
year month   day dep_time carrier tailnum
<int> <int> <int>    <int> <chr>   <chr>
1  2013     1     1      517 UA      N14228
2  2013     1     1      533 UA      N24211
3  2013     1     1      542 AA      N619AA
4  2013     1     1      544 B6      N804JB
5  2013     1     1      554 DL      N668DN
6  2013     1     1      554 UA      N39463
7  2013     1     1      555 B6      N516JB
8  2013     1     1      557 EV      N829AS
9  2013     1     1      557 B6      N593JB
10  2013     1     1      558 AA      N3ALAA
# ℹ 336,766 more rows

## Set-up

(planes2 <- select(planes, tailnum, year, model, seats))
# A tibble: 3,322 × 4
tailnum  year model     seats
<chr>   <int> <chr>     <int>
1 N10156   2004 EMB-145XR    55
2 N102UW   1998 A320-214    182
3 N103US   1999 A320-214    182
4 N104UW   1999 A320-214    182
5 N10575   2002 EMB-145LR    55
6 N105UW   1999 A320-214    182
7 N107US   1999 A320-214    182
8 N108UW   1999 A320-214    182
9 N109UW   1999 A320-214    182
10 N110UW   1999 A320-214    182
# ℹ 3,312 more rows

# Merging data

## Joining with {dplyr}

library(dplyr)

## Merging data

library(nycflights13)

## Keys

• Unique identifiers of observations

• Keys may take some work to clean first

• Double check keys for uniqueness/duplicates

• Create surrogate key if no primary key exists

## Inner joins

x
# A tibble: 3 × 2
key val_x
<dbl> <chr>
1     1 x1
2     2 x2
3     3 x3   
y
# A tibble: 3 × 2
key val_y
<dbl> <chr>
1     1 y1
2     2 y2
3     4 y3   
inner_join(x, y, by = "key")
# A tibble: 2 × 3
key val_x val_y
<dbl> <chr> <chr>
1     1 x1    y1
2     2 x2    y2   
x |>
inner_join(y, by = "key")
# A tibble: 2 × 3
key val_x val_y
<dbl> <chr> <chr>
1     1 x1    y1
2     2 x2    y2   

## Inner joins

glimpse(flights2)
Rows: 336,776
Columns: 6
$year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…$ month    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…$ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 5…
$carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "…$ tailnum  <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "…
glimpse(planes2)
Rows: 3,322
Columns: 4
$tailnum <chr> "N10156", "N102UW", "N103US", "N104UW", "N10575", "N105UW", "N…$ year    <int> 2004, 1998, 1999, 1999, 2002, 1999, 1999, 1999, 1999, 1999, 20…
$model <chr> "EMB-145XR", "A320-214", "A320-214", "A320-214", "EMB-145LR", …$ seats   <int> 55, 182, 182, 182, 55, 182, 182, 182, 182, 182, 55, 55, 55, 55…

## Inner joins

flights2 |>
inner_join(planes2, by = "tailnum")
# A tibble: 284,170 × 9
year.x month   day dep_time carrier tailnum year.y model       seats
<int> <int> <int>    <int> <chr>   <chr>    <int> <chr>       <int>
1   2013     1     1      517 UA      N14228    1999 737-824       149
2   2013     1     1      533 UA      N24211    1998 737-824       149
3   2013     1     1      542 AA      N619AA    1990 757-223       178
4   2013     1     1      544 B6      N804JB    2012 A320-232      200
5   2013     1     1      554 DL      N668DN    1991 757-232       178
6   2013     1     1      554 UA      N39463    2012 737-924ER     191
7   2013     1     1      555 B6      N516JB    2000 A320-232      200
8   2013     1     1      557 EV      N829AS    1998 CL-600-2B19    55
9   2013     1     1      557 B6      N593JB    2004 A320-232      200
10   2013     1     1      558 B6      N793JB    2011 A320-232      200
# ℹ 284,160 more rows

What do you notice about the result?

## Inner joins

Warning

Only use inner joins when you want the intersection of the two data sets!

• Left

• Right

• Full

## Left joins

x
# A tibble: 3 × 2
key val_x
<dbl> <chr>
1     1 x1
2     2 x2
3     3 x3   
y
# A tibble: 3 × 2
key val_y
<dbl> <chr>
1     1 y1
2     2 y2
3     4 y3   
left_join(x, y, by = "key")
# A tibble: 3 × 3
key val_x val_y
<dbl> <chr> <chr>
1     1 x1    y1
2     2 x2    y2
3     3 x3    <NA> 

## Left joins

head(flights2)
# A tibble: 6 × 6
year month   day dep_time carrier tailnum
<int> <int> <int>    <int> <chr>   <chr>
1  2013     1     1      517 UA      N14228
2  2013     1     1      533 UA      N24211
3  2013     1     1      542 AA      N619AA
4  2013     1     1      544 B6      N804JB
5  2013     1     1      554 DL      N668DN
6  2013     1     1      554 UA      N39463 
head(planes2)
# A tibble: 6 × 4
tailnum  year model     seats
<chr>   <int> <chr>     <int>
1 N10156   2004 EMB-145XR    55
2 N102UW   1998 A320-214    182
3 N103US   1999 A320-214    182
4 N104UW   1999 A320-214    182
5 N10575   2002 EMB-145LR    55
6 N105UW   1999 A320-214    182

## Left joins

flights2 |>
left_join(planes2, by = "tailnum")
# A tibble: 336,776 × 9
year.x month   day dep_time carrier tailnum year.y model       seats
<int> <int> <int>    <int> <chr>   <chr>    <int> <chr>       <int>
1   2013     1     1      517 UA      N14228    1999 737-824       149
2   2013     1     1      533 UA      N24211    1998 737-824       149
3   2013     1     1      542 AA      N619AA    1990 757-223       178
4   2013     1     1      544 B6      N804JB    2012 A320-232      200
5   2013     1     1      554 DL      N668DN    1991 757-232       178
6   2013     1     1      554 UA      N39463    2012 737-924ER     191
7   2013     1     1      555 B6      N516JB    2000 A320-232      200
8   2013     1     1      557 EV      N829AS    1998 CL-600-2B19    55
9   2013     1     1      557 B6      N593JB    2004 A320-232      200
10   2013     1     1      558 AA      N3ALAA      NA <NA>           NA
# ℹ 336,766 more rows

Warning

Select columns used to join with argument by

## Inner joins

Otherwise, it uses all shared columns, which may be wrong

flights2 |>
left_join(planes2)
# A tibble: 336,776 × 8
year month   day dep_time carrier tailnum model seats
<int> <int> <int>    <int> <chr>   <chr>   <chr> <int>
1  2013     1     1      517 UA      N14228  <NA>     NA
2  2013     1     1      533 UA      N24211  <NA>     NA
3  2013     1     1      542 AA      N619AA  <NA>     NA
4  2013     1     1      544 B6      N804JB  <NA>     NA
5  2013     1     1      554 DL      N668DN  <NA>     NA
6  2013     1     1      554 UA      N39463  <NA>     NA
7  2013     1     1      555 B6      N516JB  <NA>     NA
8  2013     1     1      557 EV      N829AS  <NA>     NA
9  2013     1     1      557 B6      N593JB  <NA>     NA
10  2013     1     1      558 AA      N3ALAA  <NA>     NA
# ℹ 336,766 more rows

## Right joins

x
# A tibble: 3 × 2
key val_x
<dbl> <chr>
1     1 x1
2     2 x2
3     3 x3   
y
# A tibble: 3 × 2
key val_y
<dbl> <chr>
1     1 y1
2     2 y2
3     4 y3   
right_join(x, y, by = "key")
# A tibble: 3 × 3
key val_x val_y
<dbl> <chr> <chr>
1     1 x1    y1
2     2 x2    y2
3     4 <NA>  y3   

## Full joins

x
# A tibble: 3 × 2
key val_x
<dbl> <chr>
1     1 x1
2     2 x2
3     3 x3   
y
# A tibble: 3 × 2
key val_y
<dbl> <chr>
1     1 y1
2     2 y2
3     4 y3   
full_join(x, y, by = "key")
# A tibble: 4 × 3
key val_x val_y
<dbl> <chr> <chr>
1     1 x1    y1
2     2 x2    y2
3     3 x3    <NA>
4     4 <NA>  y3   

## Solving the problem

What code combines data1 and data2 into data3?

data1
# A tibble: 12 × 3
id  cond  resp
<int> <int> <dbl>
1     1     1 0.223
2     1     2 0.254
3     1     3 0.309
4     2     1 0.143
5     2     2 0.783
6     2     3 0.385
7     3     1 0.779
8     3     2 0.618
9     3     3 0.176
10     4     1 0.791
11     4     2 0.227
12     4     3 0.627
data2
# A tibble: 6 × 2
id   age
<int> <int>
1     1    48
2     2    29
3     3    51
4     4    49
5     5    41
6     6    28
data3
# A tibble: 12 × 4
id   age  cond  resp
<int> <int> <int> <dbl>
1     1    48     1 0.223
2     1    48     2 0.254
3     1    48     3 0.309
4     2    29     1 0.143
5     2    29     2 0.783
6     2    29     3 0.385
7     3    51     1 0.779
8     3    51     2 0.618
9     3    51     3 0.176
10     4    49     1 0.791
11     4    49     2 0.227
12     4    49     3 0.627