Merging columns

Jeff Stevens

2025-02-28

Introduction

The problem

What’s different between these data sets?

What is needed to create data3 from data1 and data2?

data1
# A tibble: 12 × 3
      id  cond  resp
   <int> <int> <dbl>
 1     1     1 0.092
 2     1     2 0.878
 3     1     3 0.449
 4     2     1 0.911
 5     2     2 0.33 
 6     2     3 0.549
 7     3     1 0.797
 8     3     2 0.482
 9     3     3 0.893
10     4     1 0.638
11     4     2 0.316
12     4     3 0.614
data2
# A tibble: 6 × 2
     id   age
  <int> <int>
1     1    41
2     2    26
3     3    29
4     4    46
5     5    42
6     6    41
data3
# A tibble: 12 × 4
      id   age  cond  resp
   <int> <int> <int> <dbl>
 1     1    41     1 0.092
 2     1    41     2 0.878
 3     1    41     3 0.449
 4     2    26     1 0.911
 5     2    26     2 0.33 
 6     2    26     3 0.549
 7     3    29     1 0.797
 8     3    29     2 0.482
 9     3    29     3 0.893
10     4    46     1 0.638
11     4    46     2 0.316
12     4    46     3 0.614

Mental model of merging

Set-up

library(dplyr)
library(nycflights13)
(flights2 <- select(flights, year:day, origin, carrier, tailnum))
# A tibble: 336,776 × 6
    year month   day origin carrier tailnum
   <int> <int> <int> <chr>  <chr>   <chr>  
 1  2013     1     1 EWR    UA      N14228 
 2  2013     1     1 LGA    UA      N24211 
 3  2013     1     1 JFK    AA      N619AA 
 4  2013     1     1 JFK    B6      N804JB 
 5  2013     1     1 LGA    DL      N668DN 
 6  2013     1     1 EWR    UA      N39463 
 7  2013     1     1 EWR    B6      N516JB 
 8  2013     1     1 LGA    EV      N829AS 
 9  2013     1     1 JFK    B6      N593JB 
10  2013     1     1 LGA    AA      N3ALAA 
# ℹ 336,766 more rows

Set-up

(planes2 <- select(planes, tailnum, year, model, seats))
# A tibble: 3,322 × 4
   tailnum  year model     seats
   <chr>   <int> <chr>     <int>
 1 N10156   2004 EMB-145XR    55
 2 N102UW   1998 A320-214    182
 3 N103US   1999 A320-214    182
 4 N104UW   1999 A320-214    182
 5 N10575   2002 EMB-145LR    55
 6 N105UW   1999 A320-214    182
 7 N107US   1999 A320-214    182
 8 N108UW   1999 A320-214    182
 9 N109UW   1999 A320-214    182
10 N110UW   1999 A320-214    182
# ℹ 3,312 more rows

Merging data

Joining with {dplyr}

Merging data

Keys

  • Unique identifiers of observations

  • Keys may take some work to produce

  • Double check keys for uniqueness/duplicates

    • How do we do this?
  • Create surrogate key if no primary key exists

    • Combine columns to make unique key (which function?)
    • Create column of row numbers with row_number() (which function?)

Joins

Mutating joins

Joins columns

Mutating joins

Affects columns

Inner joins

Keep only matching observations

Inner joins

x
# A tibble: 3 × 2
    key val_x
  <dbl> <chr>
1     1 x1   
2     2 x2   
3     3 x3   
y
# A tibble: 3 × 2
    key val_y
  <dbl> <chr>
1     1 y1   
2     2 y2   
3     4 y3   
# A tibble: 2 × 3
    key val_x val_y
  <dbl> <chr> <chr>
1     1 x1    y1   
2     2 x2    y2   
x |> 
  inner_join(y, by = "key")
# A tibble: 2 × 3
    key val_x val_y
  <dbl> <chr> <chr>
1     1 x1    y1   
2     2 x2    y2   

Inner joins

glimpse(flights2)
Rows: 336,776
Columns: 6
$ year    <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 20…
$ month   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ day     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ origin  <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK",…
$ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "B…
$ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "N…
glimpse(planes2)
Rows: 3,322
Columns: 4
$ tailnum <chr> "N10156", "N102UW", "N103US", "N104UW", "N10575", "N105UW", "N…
$ year    <int> 2004, 1998, 1999, 1999, 2002, 1999, 1999, 1999, 1999, 1999, 20…
$ model   <chr> "EMB-145XR", "A320-214", "A320-214", "A320-214", "EMB-145LR", …
$ seats   <int> 55, 182, 182, 182, 55, 182, 182, 182, 182, 182, 55, 55, 55, 55…

Inner joins

flights2 |>
  inner_join(planes2, by = "tailnum")
# A tibble: 284,170 × 9
   year.x month   day origin carrier tailnum year.y model       seats
    <int> <int> <int> <chr>  <chr>   <chr>    <int> <chr>       <int>
 1   2013     1     1 EWR    UA      N14228    1999 737-824       149
 2   2013     1     1 LGA    UA      N24211    1998 737-824       149
 3   2013     1     1 JFK    AA      N619AA    1990 757-223       178
 4   2013     1     1 JFK    B6      N804JB    2012 A320-232      200
 5   2013     1     1 LGA    DL      N668DN    1991 757-232       178
 6   2013     1     1 EWR    UA      N39463    2012 737-924ER     191
 7   2013     1     1 EWR    B6      N516JB    2000 A320-232      200
 8   2013     1     1 LGA    EV      N829AS    1998 CL-600-2B19    55
 9   2013     1     1 JFK    B6      N593JB    2004 A320-232      200
10   2013     1     1 JFK    B6      N793JB    2011 A320-232      200
# ℹ 284,160 more rows

What do you notice about the result?

Inner joins

Only use inner joins when you want the intersection of the two data sets!

Outer joins

Keep observations that appear in at least one of the tables

  • Left

  • Right

  • Full

Left joins

Keep only left observations

Left joins

x
# A tibble: 3 × 2
    key val_x
  <dbl> <chr>
1     1 x1   
2     2 x2   
3     3 x3   
y
# A tibble: 3 × 2
    key val_y
  <dbl> <chr>
1     1 y1   
2     2 y2   
3     4 y3   
left_join(x, y, by = "key")
# A tibble: 3 × 3
    key val_x val_y
  <dbl> <chr> <chr>
1     1 x1    y1   
2     2 x2    y2   
3     3 x3    <NA> 

Left joins

head(flights2)
# A tibble: 6 × 6
   year month   day origin carrier tailnum
  <int> <int> <int> <chr>  <chr>   <chr>  
1  2013     1     1 EWR    UA      N14228 
2  2013     1     1 LGA    UA      N24211 
3  2013     1     1 JFK    AA      N619AA 
4  2013     1     1 JFK    B6      N804JB 
5  2013     1     1 LGA    DL      N668DN 
6  2013     1     1 EWR    UA      N39463 
head(planes2)
# A tibble: 6 × 4
  tailnum  year model     seats
  <chr>   <int> <chr>     <int>
1 N10156   2004 EMB-145XR    55
2 N102UW   1998 A320-214    182
3 N103US   1999 A320-214    182
4 N104UW   1999 A320-214    182
5 N10575   2002 EMB-145LR    55
6 N105UW   1999 A320-214    182

Left joins

flights2 |>
  left_join(planes2, by = "tailnum")
# A tibble: 336,776 × 9
   year.x month   day origin carrier tailnum year.y model       seats
    <int> <int> <int> <chr>  <chr>   <chr>    <int> <chr>       <int>
 1   2013     1     1 EWR    UA      N14228    1999 737-824       149
 2   2013     1     1 LGA    UA      N24211    1998 737-824       149
 3   2013     1     1 JFK    AA      N619AA    1990 757-223       178
 4   2013     1     1 JFK    B6      N804JB    2012 A320-232      200
 5   2013     1     1 LGA    DL      N668DN    1991 757-232       178
 6   2013     1     1 EWR    UA      N39463    2012 737-924ER     191
 7   2013     1     1 EWR    B6      N516JB    2000 A320-232      200
 8   2013     1     1 LGA    EV      N829AS    1998 CL-600-2B19    55
 9   2013     1     1 JFK    B6      N593JB    2004 A320-232      200
10   2013     1     1 LGA    AA      N3ALAA      NA <NA>           NA
# ℹ 336,766 more rows

Left joins

Select columns used to join with argument by.

Otherwise, it uses all shared columns, which may be wrong.

flights2 |>
  left_join(planes2)
# A tibble: 336,776 × 8
    year month   day origin carrier tailnum model seats
   <int> <int> <int> <chr>  <chr>   <chr>   <chr> <int>
 1  2013     1     1 EWR    UA      N14228  <NA>     NA
 2  2013     1     1 LGA    UA      N24211  <NA>     NA
 3  2013     1     1 JFK    AA      N619AA  <NA>     NA
 4  2013     1     1 JFK    B6      N804JB  <NA>     NA
 5  2013     1     1 LGA    DL      N668DN  <NA>     NA
 6  2013     1     1 EWR    UA      N39463  <NA>     NA
 7  2013     1     1 EWR    B6      N516JB  <NA>     NA
 8  2013     1     1 LGA    EV      N829AS  <NA>     NA
 9  2013     1     1 JFK    B6      N593JB  <NA>     NA
10  2013     1     1 LGA    AA      N3ALAA  <NA>     NA
# ℹ 336,766 more rows

Right joins

Keep only right observations

Right joins

x
# A tibble: 3 × 2
    key val_x
  <dbl> <chr>
1     1 x1   
2     2 x2   
3     3 x3   
y
# A tibble: 3 × 2
    key val_y
  <dbl> <chr>
1     1 y1   
2     2 y2   
3     4 y3   
right_join(x, y, by = "key")
# A tibble: 3 × 3
    key val_x val_y
  <dbl> <chr> <chr>
1     1 x1    y1   
2     2 x2    y2   
3     4 <NA>  y3   

Full joins

Keep all observations

Full joins

x
# A tibble: 3 × 2
    key val_x
  <dbl> <chr>
1     1 x1   
2     2 x2   
3     3 x3   
y
# A tibble: 3 × 2
    key val_y
  <dbl> <chr>
1     1 y1   
2     2 y2   
3     4 y3   
full_join(x, y, by = "key")
# A tibble: 4 × 3
    key val_x val_y
  <dbl> <chr> <chr>
1     1 x1    y1   
2     2 x2    y2   
3     3 x3    <NA> 
4     4 <NA>  y3   

Different column names

What if datasets have different column names for the same key?

glimpse(flights2)
Rows: 336,776
Columns: 6
$ year    <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 20…
$ month   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ day     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ origin  <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK",…
$ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "B…
$ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "N…
airports2 <- select(airports, faa, name)
glimpse(airports2)
Rows: 1,458
Columns: 2
$ faa  <chr> "04G", "06A", "06C", "06N", "09J", "0A9", "0G6", "0G7", "0P2", "0…
$ name <chr> "Lansdowne Airport", "Moton Field Municipal Airport", "Schaumburg…

Different column names

join_by()

left_join(flights2, airports2, by = join_by(origin == faa))
# A tibble: 336,776 × 7
    year month   day origin carrier tailnum name               
   <int> <int> <int> <chr>  <chr>   <chr>   <chr>              
 1  2013     1     1 EWR    UA      N14228  Newark Liberty Intl
 2  2013     1     1 LGA    UA      N24211  La Guardia         
 3  2013     1     1 JFK    AA      N619AA  John F Kennedy Intl
 4  2013     1     1 JFK    B6      N804JB  John F Kennedy Intl
 5  2013     1     1 LGA    DL      N668DN  La Guardia         
 6  2013     1     1 EWR    UA      N39463  Newark Liberty Intl
 7  2013     1     1 EWR    B6      N516JB  Newark Liberty Intl
 8  2013     1     1 LGA    EV      N829AS  La Guardia         
 9  2013     1     1 JFK    B6      N593JB  John F Kennedy Intl
10  2013     1     1 LGA    AA      N3ALAA  La Guardia         
# ℹ 336,766 more rows

Solving the problem

library(tidyverse)
nrows <- 12
set.seed(20250228)
data1 <- tibble(id = rep(1:4, each = 3), 
                cond = rep(1:3, times = 4), 
                resp = round(runif(nrows), 3))
data2 <- tibble(id = 1:6, 
                age = sample(19:75, 6, replace = TRUE))

Solving the problem

What code combines data1 and data2 into data3?

data1
# A tibble: 12 × 3
      id  cond  resp
   <int> <int> <dbl>
 1     1     1 0.092
 2     1     2 0.878
 3     1     3 0.449
 4     2     1 0.911
 5     2     2 0.33 
 6     2     3 0.549
 7     3     1 0.797
 8     3     2 0.482
 9     3     3 0.893
10     4     1 0.638
11     4     2 0.316
12     4     3 0.614
data2
# A tibble: 6 × 2
     id   age
  <int> <int>
1     1    41
2     2    26
3     3    29
4     4    46
5     5    42
6     6    41
data3
# A tibble: 12 × 4
      id   age  cond  resp
   <int> <int> <int> <dbl>
 1     1    41     1 0.092
 2     1    41     2 0.878
 3     1    41     3 0.449
 4     2    26     1 0.911
 5     2    26     2 0.33 
 6     2    26     3 0.549
 7     3    29     1 0.797
 8     3    29     2 0.482
 9     3    29     3 0.893
10     4    46     1 0.638
11     4    46     2 0.316
12     4    46     3 0.614

Let’s code!

Merging columns