Jeff Stevens

2023-03-06

# Introduction

## The problem

What’s different between these data sets?

What is needed to create data2 from data1?

data1
# A tibble: 12 × 3
val1   val2    val3
<dbl>  <dbl>   <dbl>
1 0.773  0.470  0.00431
2 0.827  0.751  0.00923
3 0.746  0.220  0.00814
4 0.953  0.199  0.00767
5 0.298  0.894  0.00221
6 0.860  0.0149 0.00499
7 0.0460 0.956  0.00779
8 0.947  0.162  0.00875
9 0.511  0.189  0.00986
10 0.712  0.0969 0.00862
11 0.944  0.370  0.00209
12 0.834  0.585  0.00420
data2
# A tibble: 12 × 3
percent_val1 log_val2 val3
<dbl>    <dbl> <chr>
1         77.3  -0.756  4.3e-03
2         82.7  -0.287  9.2e-03
3         74.6  -1.51   8.1e-03
4         95.3  -1.61   7.7e-03
5         29.8  -0.113  2.2e-03
6         86.0  -4.20   5.0e-03
7          4.6  -0.0452 7.8e-03
8         94.7  -1.82   8.7e-03
9         51.1  -1.67   9.9e-03
10         71.2  -2.33   8.6e-03
11         94.4  -0.994  2.1e-03
12         83.4  -0.536  4.2e-03

## Set-up

library(tidyverse)
library(nycflights13)

## Types of numbers

Doubles are floating point numbers

Note

Floating point number: a number without a fixed number of digits after the decimal point

Floating point numbers ≈ scientific notation

## Types of numbers

Computer memory is limited, so numbers cannot be stored with infinite precision. As a result, floating point numbers are stored imprecisely.

sqrt(2) ^ 2 == 2
[1] FALSE
1 / 49 * 49 == 1
[1] FALSE

# Comparing and counting

## Comparing numbers

x <- 1
y <- 1.00000000000001
z <- 1.001
x == y
[1] FALSE
all.equal(x, y)
[1] TRUE
all.equal(x, z)
[1] "Mean relative difference: 0.001"
all.equal(x, z, tolerance = 1e-2)
[1] TRUE
sqrt(2) ^ 2 == 2
[1] FALSE
all.equal(sqrt(2) ^ 2, 2)
[1] TRUE
dplyr::near(sqrt(2) ^ 2, 2)
[1] TRUE

## Counts

As a reminder, we’ve already seen how to use dplyr::count()

count(flights, carrier)
# A tibble: 16 × 2
carrier     n
<chr>   <int>
1 9E      18460
2 AA      32729
3 AS        714
4 B6      54635
5 DL      48110
6 EV      54173
7 F9        685
8 FL       3260
9 HA        342
10 MQ      26397
11 OO         32
12 UA      58665
13 US      20536
14 VX       5162
15 WN      12275
16 YV        601

## Counts

We can also automatically sort by count.

count(flights, carrier, sort = TRUE)
# A tibble: 16 × 2
carrier     n
<chr>   <int>
1 UA      58665
2 B6      54635
3 EV      54173
4 DL      48110
5 AA      32729
6 MQ      26397
7 US      20536
8 9E      18460
9 WN      12275
10 VX       5162
11 FL       3260
12 AS        714
13 F9        685
14 YV        601
15 HA        342
16 OO         32

We can also sum a variable within each group, instead of just counting rows, by using the wt argument.

count(flights, carrier, wt = distance)
# A tibble: 16 × 2
carrier        n
<chr>      <dbl>
1 9E       9788152
2 AA      43864584
3 AS       1715028
4 B6      58384137
5 DL      59507317
6 EV      30498951
7 F9       1109700
8 FL       2167344
9 HA       1704186
10 MQ      15033955
11 OO         16026
12 UA      89705524
13 US      11365778
14 VX      12902327
15 WN      12229203
16 YV        225395

## Counts

Remember that n() counts the rows in each group inside summarise()

flights |>
group_by(carrier) |>
summarise(n = n())
# A tibble: 16 × 2
carrier     n
<chr>   <int>
1 9E      18460
2 AA      32729
3 AS        714
4 B6      54635
5 DL      48110
6 EV      54173
7 F9        685
8 FL       3260
9 HA        342
10 MQ      26397
11 OO         32
12 UA      58665
13 US      20536
14 VX       5162
15 WN      12275
16 YV        601

n_distinct() counts the number of distinct (unique) values within a group

flights |>
group_by(dest) |>
summarise(carriers = n_distinct(carrier))
# A tibble: 105 × 2
dest  carriers
<chr>    <int>
1 ABQ          1
2 ACK          1
3 ALB          1
4 ANC          1
5 ATL          7
6 AUS          6
7 AVL          2
8 BDL          2
9 BGR          2
10 BHM          1
# ℹ 95 more rows

## Counting NAs

To count NAs, you can sum() up TRUE responses to is.na()

flights |>
group_by(dest) |>
summarize(n_cancelled = sum(is.na(dep_time)))
# A tibble: 105 × 2
dest  n_cancelled
<chr>       <int>
1 ABQ             0
2 ACK             0
3 ALB            20
4 ANC             0
5 ATL           317
6 AUS            21
7 AVL            12
8 BDL            31
9 BGR            15
10 BHM            25
# ℹ 95 more rows

## Counting NAs

This trick can be used for any logical vector

sum(flights$month == 1)
[1] 27004
nrow(filter(flights, month == 1))
[1] 27004

# Transforming numbers

## Operations

Mathematical operators are applied element-wise, and a shorter vector is recycled across all elements of a longer vector

0:10 * 5
 [1]  0  5 10 15 20 25 30 35 40 45 50

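You can operate on two vectors of length > 1, but the length of the longer vector should be a multiple of the length of the shorter vector; otherwise R still recycles the shorter vector but issues a warning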

0:10 * c(5, 6)
Warning in 0:10 * c(5, 6): longer object length is not a multiple of shorter
object length
 [1]  0  6 10 18 20 30 30 42 40 54 50
0:11 * c(5, 6)
 [1]  0  6 10 18 20 30 30 42 40 54 50 66

## Mathematical transformations

• sqrt()
• log()
• log10()
• log2()
• sin()
• asin()
sqrt(seq(0, 100, 10))
 [1]  0.000000  3.162278  4.472136  5.477226  6.324555  7.071068  7.745967
[8]  8.366600  8.944272  9.486833 10.000000
log(runif(10))
 [1] -0.3921567 -0.3017750 -2.4044192 -0.6672081 -0.3630729 -1.9276563
[7] -1.0672688 -4.2340097 -0.8112143 -1.0013072
asin(sqrt(runif(10)))
 [1] 0.8077234 1.1635879 0.1599809 0.8139120 0.5528058 0.6495491 1.2270613
[8] 0.9685195 0.7673122 0.8240033

## Rounding

Control the number of decimal places with round() (use signif() to control significant digits)

round(123.456, 2)  # two digits
[1] 123.46
round(123.456, 1)  # one digit
[1] 123.5
round(123.456) # whole number
[1] 123
round(123.456, -1) # round to nearest ten
[1] 120
round(123.456, -2) # round to nearest hundred
[1] 100

# Formatting numbers

## Formatting

When numbers get too big, too small, or need other formatting, use format()

(x <- 0.0020)
[1] 0.002
format(x, scientific = TRUE)
[1] "2e-03"
format(x, nsmall = 4)
[1] "0.0020"
(y <- 12345678.9)
[1] 12345679
format(y, scientific = TRUE,
digits = 3)
[1] "1.23e+07"
format(y, big.mark = ",")
[1] "12,345,679"

## Cutting numbers into ranges

If you need to bin numbers into ranges, use cut()

set.seed(1)
(x <- runif(12, min = 0, max = 100))
 [1] 26.550866 37.212390 57.285336 90.820779 20.168193 89.838968 94.467527
[8] 66.079779 62.911404  6.178627 20.597457 17.655675
cut(x, breaks = c(0, 33, 66, 100))
 [1] (0,33]   (33,66]  (33,66]  (66,100] (0,33]   (66,100] (66,100] (66,100]
[9] (33,66]  (0,33]   (0,33]   (0,33]
Levels: (0,33] (33,66] (66,100]
cut(x, breaks = c(0, 33, 66, 100), labels = c("Low", "Medium", "High"))
 [1] Low    Medium Medium High   Low    High   High   High   Medium Low
[11] Low    Low
Levels: Low Medium High

## Solving the problem

data1
# A tibble: 12 × 3
val1   val2    val3
<dbl>  <dbl>   <dbl>
1 0.773  0.470  0.00431
2 0.827  0.751  0.00923
3 0.746  0.220  0.00814
4 0.953  0.199  0.00767
5 0.298  0.894  0.00221
6 0.860  0.0149 0.00499
7 0.0460 0.956  0.00779
8 0.947  0.162  0.00875
9 0.511  0.189  0.00986
10 0.712  0.0969 0.00862
11 0.944  0.370  0.00209
12 0.834  0.585  0.00420
data2
# A tibble: 12 × 3
percent_val1 log_val2 val3
<dbl>    <dbl> <chr>
1         77.3  -0.756  4.3e-03
2         82.7  -0.287  9.2e-03
3         74.6  -1.51   8.1e-03
4         95.3  -1.61   7.7e-03
5         29.8  -0.113  2.2e-03
6         86.0  -4.20   5.0e-03
7          4.6  -0.0452 7.8e-03
8         94.7  -1.82   8.7e-03
9         51.1  -1.67   9.9e-03
10         71.2  -2.33   8.6e-03
11         94.4  -0.994  2.1e-03
12         83.4  -0.536  4.2e-03