Chapter 3 R programming

This chapter covers base R features that I find important and that are often overlooked by, or unknown to, many R users.

3.1 Common mistakes

If you are using R and you think you’re in hell, this is a map for you.

– Patrick Burns

3.1.1 Equality

(0.1 + 0.2) == 0.3

#> [1] FALSE

print(c(0.1, 0.2, 0.3), digits = 20)

#> [1] 0.10000000000000000555 0.20000000000000001110 0.29999999999999998890

all.equal(0.1 + 0.2, 0.3)  ## equality with some tolerance

#> [1] TRUE

all.equal(0.1 + 0.2, 0.3, tolerance = 0)

#> [1] "Mean relative difference: 1.850372e-16"

all.equal(0.1 + 0.2, 0.4)

#> [1] "Mean relative difference: 0.3333333"

isTRUE(all.equal(0.1 + 0.2, 0.4))  ## if you want a boolean, use isTRUE()

#> [1] FALSE

dplyr::near(0.1 + 0.2, 0.3)  ## similar, from the {dplyr} package

#> [1] TRUE

3.1.2 Arguments

min(-1, 5, 118)

#> [1] -1

max(-1, 5, 118)

#> [1] 118

mean(-1, 5, 118)

#> [1] -1

median(-1, 5, 118)

#> [1] -1

How to explain the issue with mean and median? Let us look at the parameters of these functions:

args(max)

#> function (..., na.rm = FALSE) 
#> NULL

args(mean)

#> function (x, ...) 
#> NULL

args(median)

#> function (x, na.rm = FALSE, ...) 
#> NULL

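max() takes all its values through ..., whereas only the first argument (x) of mean() and median() is the data; the other values are matched to other parameters and do not enter the computation as data. One solution is to always use a vector: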

min(c(-1, 5, 118))

#> [1] -1

max(c(-1, 5, 118))

#> [1] 118

mean(c(-1, 5, 118))

#> [1] 40.66667

median(c(-1, 5, 118))

#> [1] 5

3.1.3 Others

sample(1:10)

#>  [1]  8  5  2  7  6  3  4  9  1 10

sample(10)

#>  [1]  1 10  4  6  3  2  9  8  5  7

sample(10.1)

#>  [1]  3  2 10  9  8  7  4  5  1  6

n <- 10
1:n-1  ## is (1:n) - 1, so 0:(n - 1)

#>  [1] 0 1 2 3 4 5 6 7 8 9

1:(n-1)

#> [1] 1 2 3 4 5 6 7 8 9

seq_len(n - 1)

#> [1] 1 2 3 4 5 6 7 8 9

1:0

#> [1] 1 0

seq_len(0)  ## prefer using seq_len(n) rather than 1:n (e.g. in for-loops)

#> integer(0)

seq_along(5:7)  ## a shortcut for seq_len(length(.))

#> [1] 1 2 3

3.2 R base objects

3.2.1 Types

There are several “atomic” types of data: logical, integer, double and character (listed in order of coercion, from the least to the most general type; see below). There are also raw and complex, but they are rarely used.

You cannot mix types in an atomic vector, but you can in a list. Coercion will automatically occur when you mix types in a vector:

(a <- FALSE)

#> [1] FALSE

typeof(a)

#> [1] "logical"

(b <- 1:10)

#>  [1]  1  2  3  4  5  6  7  8  9 10

typeof(b)

#> [1] "integer"

c(a, b)  ## FALSE is coerced to an integer -> 0

#>  [1]  0  1  2  3  4  5  6  7  8  9 10

(c <- 10.5)

#> [1] 10.5

typeof(c)

#> [1] "double"

(d <- c(b, c))  ## integers are coerced to double

#>  [1]  1.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0 10.0 10.5

c(d, "a")  ## coerced to character

#>  [1] "1"    "2"    "3"    "4"    "5"    "6"    "7"    "8"    "9"    "10"   "10.5" "a"

c(list(1), "a")

#> [[1]]
#> [1] 1
#> 
#> [[2]]
#> [1] "a"

50 < "7"  ## 50 is coerced to "50", so this compares the strings "50" < "7"

#> [1] TRUE

3.2.2 Exercise

Use the automatic type coercion to convert this boolean matrix to a numeric one (with 0s and 1s). [What do you need to change in your code to get an integer matrix instead of a numeric one?]

(mat <- matrix(sample(c(TRUE, FALSE), 12, replace = TRUE), nrow = 3))

#>      [,1]  [,2]  [,3]  [,4]
#> [1,] TRUE  TRUE FALSE  TRUE
#> [2,] TRUE FALSE FALSE FALSE
#> [3,] TRUE FALSE FALSE FALSE

3.3 Base objects and accessors

3.3.1 Objects

“atomic” vector: vector of one base type (see above).
scalar: there is no such thing in R; a single value is simply a vector of length 1.
matrices / arrays: a vector with some dimensions (attribute).

(vec <- 1:12)

#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12

dim(vec) <- c(3, 4)
vec

#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12

class(vec)

#> [1] "matrix" "array"

dim(vec) <- c(3, 2, 2)
vec

#> , , 1
#> 
#>      [,1] [,2]
#> [1,]    1    4
#> [2,]    2    5
#> [3,]    3    6
#> 
#> , , 2
#> 
#>      [,1] [,2]
#> [1,]    7   10
#> [2,]    8   11
#> [3,]    9   12

class(vec)

#> [1] "array"

list: vector of elements with possibly different types in it.
data.frame: a list whose elements all have the same length, displayed somewhat like a matrix.

head(iris)

#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa

dim(iris)

#> [1] 150   5

length(iris)  ## a data.frame is also a list

#> [1] 5

3.3.2 Accessors

The [ accessor is used to access a subset of the data with the same class.

(x <- 1:5)

#> [1] 1 2 3 4 5

x[2:3]

#> [1] 2 3

x[2:8]  ## /!\ no warning

#> [1]  2  3  4  5 NA NA NA

(y <- matrix(1:12, nrow = 3))

#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12

y[4:9]  ## a matrix is also a vector

#> [1] 4 5 6 7 8 9

(l <- list(a = 1, b = "I love R", c = matrix(1:6, nrow = 2)))

#> $a
#> [1] 1
#> 
#> $b
#> [1] "I love R"
#> 
#> $c
#>      [,1] [,2] [,3]
#> [1,]    1    3    5
#> [2,]    2    4    6

l[2:3]

#> $b
#> [1] "I love R"
#> 
#> $c
#>      [,1] [,2] [,3]
#> [1,]    1    3    5
#> [2,]    2    4    6

head(iris)

#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa

head(iris[3:4])

#>   Petal.Length Petal.Width
#> 1          1.4         0.2
#> 2          1.4         0.2
#> 3          1.3         0.2
#> 4          1.5         0.2
#> 5          1.4         0.2
#> 6          1.7         0.4

class(iris[5])

#> [1] "data.frame"

You can also use logical and character vectors to index these objects.

(x <- 1:4)

#> [1] 1 2 3 4

x[c(FALSE, TRUE, FALSE, TRUE)]

#> [1] 2 4

x[c(FALSE, TRUE)]  ## logical vectors are recycled

#> [1] 2 4

head(iris[c("Petal.Length", "Species")])

#>   Petal.Length Species
#> 1          1.4  setosa
#> 2          1.4  setosa
#> 3          1.3  setosa
#> 4          1.5  setosa
#> 5          1.4  setosa
#> 6          1.7  setosa

The [[ accessor is used to access a single element.

(x <- 1:10)

#>  [1]  1  2  3  4  5  6  7  8  9 10

x[[3]]

#> [1] 3

l[[2]]

#> [1] "I love R"

iris[["Species"]]

#>  [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
#> [13] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
#> [25] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
#> [37] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
#> [49] setosa setosa
#>  [ reached getOption("max.print") -- omitted 100 entries ]
#> Levels: setosa versicolor virginica

Figure 3.1: Indexing lists in R. [Source: https://goo.gl/8UkcHq]

Beware of partial matching:

x <- list(aardvark = 1:5)
x$a

#> [1] 1 2 3 4 5

x[["a"]]

#> NULL

x[["a", exact = FALSE]]

#> [1] 1 2 3 4 5

Special use of the [ accessor for array-like data.

(mat <- matrix(1:12, 3))

#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12

mat[1, ]

#> [1]  1  4  7 10

mat[, 1:2]

#>      [,1] [,2]
#> [1,]    1    4
#> [2,]    2    5
#> [3,]    3    6

mat[1, 1:2]

#> [1] 1 4

mat[1, 1:2, drop = FALSE]

#>      [,1] [,2]
#> [1,]    1    4

(two_col_ind <- cbind(c(1, 3, 2), c(1, 4, 2)))

#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    3    4
#> [3,]    2    2

mat[two_col_ind]

#> [1]  1 12  5

mat[]

#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12

mat[] <- 2
mat

#>      [,1] [,2] [,3] [,4]
#> [1,]    2    2    2    2
#> [2,]    2    2    2    2
#> [3,]    2    2    2    2

If you use arrays with more than two dimensions, simply add an additional comma for every new dimension.

3.3.3 Exercises

Use the dimension attribute to make a function that computes the sum of every n consecutive elements of a vector. In which order are matrix elements stored? [Which are the special cases that you should consider?]
```
advr38pkg::sum_every(1:10, 2)
```
```
#> [1]  3  7 11 15 19
```

Compute the mean of every numeric column of the iris dataset. Expected result:

#> Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
#>     5.843333     3.057333     3.758000     1.199333

Convert the following matrix to a vector by replacing (0, 0) -> 0; (0, 1) -> 1; (1, 1) -> 2; (1, 0) -> NA.

mat <- matrix(0, 10, 2); mat[c(5, 8, 9, 12, 15, 16, 17, 19)] <- 1; mat

#>       [,1] [,2]
#>  [1,]    0    0
#>  [2,]    0    1
#>  [3,]    0    0
#>  [4,]    0    0
#>  [5,]    1    1
#>  [6,]    0    1
#>  [7,]    0    1
#>  [8,]    1    0
#>  [9,]    1    1
#> [10,]    0    0

by using this matrix:

(decode <- matrix(c(0, NA, 1, 2), 2))

#>      [,1] [,2]
#> [1,]    0    1
#> [2,]   NA    2

Start by doing it for one row, then use apply() for all rows, and finally replace apply() with a special accessor; what is the benefit?

Expected result:

#>  [1]  0  1  0  0  2  1  1 NA  2  0

3.4 Useful R base functions

In this section, I present some useful R base functions (also see this comprehensive list in French and this one in English):

3.4.1 General

# To get some help
?topic

# Run code from the example section
example(sum)

# Structure overview
str(iris)  ## skimr::skim(iris) is also very useful

#> 'data.frame':    150 obs. of  5 variables:
#>  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#>  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#>  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#>  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#>  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# List objects in the environment
ls()

#>  [1] "a"           "b"           "c"           "d"           "decode"      "l"          
#>  [7] "mat"         "n"           "osef"        "two_col_ind" "vec"         "x"          
#> [13] "y"

# Remove objects from the environment
rm(list = ls())  ## remove all objects in the global environment

# For a particular method, list available implementations for different classes 
methods(summary)

#>  [1] summary.aov                         summary.aovlist*                   
#>  [3] summary.aspell*                     summary.check_packages_in_dir*     
#>  [5] summary.connection                  summary.data.frame                 
#>  [7] summary.Date                        summary.default                    
#>  [9] summary.ecdf*                       summary.factor                     
#> [11] summary.glm                         summary.infl*                      
#> [13] summary.lm                          summary.loess*                     
#> [15] summary.manova                      summary.matrix                     
#> [17] summary.mlm*                        summary.nls*                       
#> [19] summary.packageStatus*              summary.POSIXct                    
#> [21] summary.POSIXlt                     summary.ppr*                       
#> [23] summary.prcomp*                     summary.princomp*                  
#> [25] summary.proc_time                   summary.rlang:::list_of_conditions*
#> [27] summary.rlang_error*                summary.rlang_message*             
#> [29] summary.rlang_trace*                summary.rlang_warning*             
#> [31] summary.srcfile                     summary.srcref                     
#> [33] summary.stepfun                     summary.stl*                       
#> [35] summary.table                       summary.tukeysmooth*               
#> [37] summary.vctrs_sclr*                 summary.vctrs_vctr*                
#> [39] summary.warnings                   
#> see '?methods' for accessing help and source code

# List methods available for a particular class
methods(class = "lm")

#>  [1] add1           alias          anova          case.names     coerce        
#>  [6] confint        cooks.distance deviance       dfbeta         dfbetas       
#> [11] drop1          dummy.coef     effects        extractAIC     family        
#> [16] formula        hatvalues      influence      initialize     kappa         
#> [21] labels         logLik         model.frame    model.matrix   nobs          
#> [26] plot           predict        print          proj           qr            
#> [31] residuals      rstandard      rstudent       show           simulate      
#> [36] slotsFromS3    summary        variable.names vcov          
#> see '?methods' for accessing help and source code

# Call a function with arguments as a list
(list_of_int <- as.list(1:5))

#> [[1]]
#> [1] 1
#> 
#> [[2]]
#> [1] 2
#> 
#> [[3]]
#> [1] 3
#> 
#> [[4]]
#> [1] 4
#> 
#> [[5]]
#> [1] 5

do.call('c', list_of_int)

#> [1] 1 2 3 4 5

3.4.2 Sequence and vector operations

1:10  ## of type integer

#>  [1]  1  2  3  4  5  6  7  8  9 10

seq(1, 10, by = 2)  ## of type double

#> [1] 1 3 5 7 9

seq(1, 100, length.out = 10)

#>  [1]   1  12  23  34  45  56  67  78  89 100

seq_len(5)

#> [1] 1 2 3 4 5

seq_along(21:24)

#> [1] 1 2 3 4

rep(1:4, 2)

#> [1] 1 2 3 4 1 2 3 4

rep(1:4, each = 2)

#> [1] 1 1 2 2 3 3 4 4

rep(1:4, 4:1)

#>  [1] 1 1 1 1 2 2 2 3 3 4

rep_len(1:3, 8)

#> [1] 1 2 3 1 2 3 1 2

replicate(5, rnorm(10))  ## How to use a multiline expression?

#>             [,1]         [,2]       [,3]        [,4]       [,5]
#>  [1,] -0.6341571  1.125124388 -0.9919725  0.34284644  0.8250381
#>  [2,] -1.6550308  0.968522076  0.5723411 -0.30374360  1.6437909
#>  [3,]  0.2958799 -1.750949367 -0.7610351  0.71556801  1.5508268
#>  [4,] -0.1550491 -0.008962213 -0.5345093  1.59411062 -1.1424409
#>  [5,] -0.1228469 -2.287465200 -0.1149587  0.35614636 -0.2287272
#>  [6,] -1.2682595 -1.730174725  0.1711193  0.05353415 -0.3090971
#>  [7,] -0.3868314 -0.766938647  0.1402920 -1.43115226 -1.3223686
#>  [8,] -0.4799821  0.561404748  0.1152301 -0.02839102 -0.6439932
#>  [9,]  0.2878766  0.051715293  0.2239831  1.69478626 -0.4033820
#> [10,] -0.8004772 -0.531994534  0.1606077  0.58350819 -0.4937378

sort(c(1, 6, 8, 2, 2))

#> [1] 1 2 2 6 8

order(c(1, 6, 8, 2, 2), c(0, 0, 0, 2, 1))

#> [1] 1 5 4 2 3

rank(c(1, 6, 8, 2, 2))

#> [1] 1.0 4.0 5.0 2.5 2.5

rank(c(1, 6, 8, 2, 2), ties.method = "first")

#> [1] 1 4 5 2 3

sort(c("a1", "a2", "a10"))

#> [1] "a1"  "a10" "a2"

gtools::mixedsort(c("a1", "a2", "a10"))  ## not in base, but useful

#> [1] "a1"  "a2"  "a10"

which.max(c(1, 5, 3, 6, 2, 0))

#> [1] 4

which.min(c(1, 5, 3, 6, 2, 0))

#> [1] 6

unique(c(1, NA, 2, 3, 2, NA, 3))

#> [1]  1 NA  2  3

table(rep(1:4, 4:1))

#> 
#> 1 2 3 4 
#> 4 3 2 1

table(A = c(1, 1, 1, 2, 2), B = c(1, 2, 1, 2, 1))

#>    B
#> A   1 2
#>   1 2 1
#>   2 1 1

sample(10)

#>  [1]  2  9  6 10  7  3  4  1  5  8

sample(3:10, 5)

#> [1] 5 3 7 8 9

sample(3:10, 50, replace = TRUE)

#>  [1]  5  8  6  5  7  4  7  6  8  9  9  4  6 10  8  4  8  7  7  7  6  7  9  5  4  6  4  6
#> [29]  8 10  9  5  5  9  6  3  8  5  8  5 10  7  3  3  5  3  7 10  4  6

round(x <- runif(10, max = 100))  ## 10 random numbers between 0 and 100

#>  [1] 30 17 64 82  3 17 91 88 40 81

round(x, digits = 2)

#>  [1] 30.25 17.18 63.80 81.96  3.21 17.23 90.78 87.81 40.31 81.30

round(x, -1)

#>  [1] 30 20 60 80  0 20 90 90 40 80

pmin(1:4, 4:1)

#> [1] 1 2 2 1

pmax(1:4, 4:1)

#> [1] 4 3 3 4

outer(1:4, 1:3, '+')

#>      [,1] [,2] [,3]
#> [1,]    2    3    4
#> [2,]    3    4    5
#> [3,]    4    5    6
#> [4,]    5    6    7

expand.grid(param1 = c(5, 50), param2 = c(1, 3, 10))

#>   param1 param2
#> 1      5      1
#> 2     50      1
#> 3      5      3
#> 4     50      3
#> 5      5     10
#> 6     50     10

Also see this nice Q/A on grouping functions and the *apply family and this book chapter about looping.

3.4.3 Character operations

paste("I", "am", "me")

#> [1] "I am me"

paste0("test", 0)

#> [1] "test0"

paste0("PC", 1:10)

#>  [1] "PC1"  "PC2"  "PC3"  "PC4"  "PC5"  "PC6"  "PC7"  "PC8"  "PC9"  "PC10"

me <- "Florian"
glue::glue("I am {me}")  ## not in base, but so useful

#> I am Florian

(x <- list.files(pattern = "\\.Rmd$", full.names = TRUE))

#> [1] "./good-practices.Rmd"       "./index.Rmd"                "./intro.Rmd"               
#> [4] "./packages.Rmd"             "./performance.Rmd"          "./presentation_project.Rmd"
#> [7] "./rprog.Rmd"                "./shiny.Rmd"                "./tidyverse.Rmd"

sub("\\.Rmd$", ".pdf", x)

#> [1] "./good-practices.pdf"       "./index.pdf"                "./intro.pdf"               
#> [4] "./packages.pdf"             "./performance.pdf"          "./presentation_project.pdf"
#> [7] "./rprog.pdf"                "./shiny.pdf"                "./tidyverse.pdf"

(y <- sample(letters[1:4], 10, replace = TRUE))

#>  [1] "d" "a" "b" "d" "a" "a" "a" "d" "b" "b"

match(y, letters[1:4])

#>  [1] 4 1 2 4 1 1 1 4 2 2

y %in% letters[1:2]

#>  [1] FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE

split(1:12, rep(letters[1:3], 4))

#> $a
#> [1]  1  4  7 10
#> 
#> $b
#> [1]  2  5  8 11
#> 
#> $c
#> [1]  3  6  9 12

intersect(letters[1:4], letters[3:5])

#> [1] "c" "d"

union(letters[1:4], letters[3:5])

#> [1] "a" "b" "c" "d" "e"

setdiff(letters[1:4], letters[3:5])

#> [1] "a" "b"

3.4.4 Logical operators

TRUE | stop("will go there")

#> Error in eval(expr, envir, enclos): will go there

TRUE || stop("won't go there")  ## won't evaluate second condition if first one is TRUE

#> [1] TRUE

c(TRUE, FALSE, TRUE, TRUE) & c(FALSE, TRUE, TRUE, FALSE)

#> [1] FALSE FALSE  TRUE FALSE

c(TRUE, FALSE, TRUE, TRUE) && c(FALSE, TRUE, TRUE, FALSE)  ## /!\ silent in older R versions, a warning in R 4.2.x (shown below), an error since R 4.3.0

#> Warning in c(TRUE, FALSE, TRUE, TRUE) && c(FALSE, TRUE, TRUE, FALSE): 'length(x) = 4 > 1'
#> in coercion to 'logical(1)'

#> Warning in c(TRUE, FALSE, TRUE, TRUE) && c(FALSE, TRUE, TRUE, FALSE): 'length(x) = 4 > 1'
#> in coercion to 'logical(1)'

#> [1] FALSE

(x <- rnorm(10))

#>  [1] -0.222843272 -0.002146418  1.878748742 -0.141587374 -2.946771396  0.210897602
#>  [7]  0.780448114 -1.061463888 -0.797764227 -0.978253062

ifelse(x > 0, x, -x)  # try to find two other equivalents

#>  [1] 0.222843272 0.002146418 1.878748742 0.141587374 2.946771396 0.210897602 0.780448114
#>  [8] 1.061463888 0.797764227 0.978253062

Beware of ifelse() (learn more there): it returns a value with the same length as its first argument (the test). For example:

ifelse(FALSE, 0, 1:5)

#> [1] 1

`if`(FALSE, 0, 1:5)

#> [1] 1 2 3 4 5

if (FALSE) 0 else 1:5

#> [1] 1 2 3 4 5

3.4.5 Exercises

Use sample(), rep_len() and split() to make a function that randomly splits some indices into a list of K groups of indices (like for cross-validation). [Which are the special cases that you should consider?]

advr38pkg::split_ind(1:40, 3)

#> $`1`
#>  [1]  1  3  7  9 10 13 15 22 25 26 29 31 33 35
#> 
#> $`2`
#>  [1]  2  4  8 12 14 16 17 18 19 24 27 38 39
#> 
#> $`3`
#>  [1]  5  6 11 20 21 23 28 30 32 34 36 37 40

Use replicate() and sample() to get a 95% confidence interval (using bootstrapping) for the mean of the following vector:

set.seed(1)
(x <- rnorm(10))

#>  [1] -0.6264538  0.1836433 -0.8356286  1.5952808  0.3295078 -0.8204684  0.4874291
#>  [8]  0.7383247  0.5757814 -0.3053884

mean(x)

#> [1] 0.1322028

Expected output (approximately):

#>       2.5%      97.5% 
#> -0.3145143  0.5998608

Use match() and some special accessor to add a column “my_val” to the data frame my_mtcars, containing, for each row, the value of the column named in “my_col”. [Can your solution be used for any number of column names?]

my_mtcars <- mtcars[c("mpg", "hp")]
my_mtcars$my_col <- sample(c("mpg", "hp"), size = nrow(my_mtcars), replace = TRUE)
head(my_mtcars)

#>                    mpg  hp my_col
#> Mazda RX4         21.0 110    mpg
#> Mazda RX4 Wag     21.0 110    mpg
#> Datsun 710        22.8  93     hp
#> Hornet 4 Drive    21.4 110     hp
#> Hornet Sportabout 18.7 175    mpg
#> Valiant           18.1 105     hp

Expected result (head):

#>                    mpg  hp my_col my_val
#> Mazda RX4         21.0 110    mpg   21.0
#> Mazda RX4 Wag     21.0 110    mpg   21.0
#> Datsun 710        22.8  93     hp     93
#> Hornet 4 Drive    21.4 110     hp    110
#> Hornet Sportabout 18.7 175    mpg   18.7
#> Valiant           18.1 105     hp    105

In the following data frame (recall that a data frame is also a list), for the first 3 columns, replace letters by corresponding numbers based on the code:

df <- data.frame(
  id1 = c("a", "f", "a"),
  id2 = c("b", "e", "e"), 
  id3 = c("c", "d", "f"),
  inter = c(7.343, 2.454, 3.234),
  stringsAsFactors = FALSE
)
df

#>   id1 id2 id3 inter
#> 1   a   b   c 7.343
#> 2   f   e   d 2.454
#> 3   a   e   f 3.234

(code <- setNames(1:6, letters[1:6]))

#> a b c d e f 
#> 1 2 3 4 5 6

Expected result:

#>   id1 id2 id3 inter
#> 1   1   2   3 7.343
#> 2   6   5   4 2.454
#> 3   1   5   6 3.234

3.5 Environments and scoping

Lexical scoping determines where to look for values, not when to look for them. R looks for values when the function is run, not when it’s created. This means that the output of a function can be different depending on objects outside its environment:

# Lexical-scoping demo: `f` has no local `x`, so the lookup continues in the
# environment where `f` was created, i.e. the body of this call to h().
h <- function() {
  x <- 10
  f <- function() x + 1
  f()
}

x <- 100
h()

#> [1] 11

Variable x is not defined inside f so R will look at the environment of f (where f was defined) and then at the parent environment, and so on. Here, the first x that is found has value 10.

Be aware that, when looking up functions, package environments are checked last, so you can redefine (mask) a function without noticing.

# Deliberately shadow base::c() with a user-defined function of the same name.
c <- function(...) {
  paste0(...)
}
c(1, 2, 3)

#> [1] "123"

base::c(1, 2, 3)  ## you need to specify the package explicitly

#> [1] 1 2 3

rm(c)  ## remove the new function from the environment
c(1, 2, 3)

#> [1] 1 2 3

You can use the <<- operator to change the value of an object in an upper environment:

count1 <- 0
count2 <- 0
# Counter demo: returns i + 1 and, as a side effect, contrasts `<-` with `<<-`.
f <- function(i) {
  ## `<-` creates a count1 local to this call (the outer count1 is only read),
  ## so the outer count1 is never modified
  count1 <-  count1 + 1
  ## `<<-` assigns to the count2 found in an enclosing environment,
  ## so the increment persists across calls
  count2 <<- count2 + 1
  i + 1
}
sapply(1:10, f)

#>  [1]  2  3  4  5  6  7  8  9 10 11

c(count1, count2)

#> [1]  0 10

Finally, how does ... work? Basically, you copy and paste what is put in ...:

# Forward everything captured by `...` (argument names included) to list().
f1 <- function(...) list(...)
f1(a = 2, b = 3)

#> $a
#> [1] 2
#> 
#> $b
#> [1] 3

list(a = 2, b = 3)

#> $a
#> [1] 2
#> 
#> $b
#> [1] 3

Learn more about functions and scoping rules of R with the R Programming for Data Science book.

3.6 Attributes and classes

Attributes are metadata associated with an object. You can get/set the list of attributes with attributes() or one particular attribute with attr().

attributes(iris)

#> $names
#> [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"     
#> 
#> $class
#> [1] "data.frame"
#> 
#> $row.names
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
#> [29] 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
#>  [ reached getOption("max.print") -- omitted 100 entries ]

class(iris)

#> [1] "data.frame"

attr(iris, "row.names")

#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
#> [29] 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
#>  [ reached getOption("max.print") -- omitted 100 entries ]

You can use structure() to create an object and add some arbitrary attributes.

structure(1:10, my_fancy_attribute = "blabla")

#>  [1]  1  2  3  4  5  6  7  8  9 10
#> attr(,"my_fancy_attribute")
#> [1] "blabla"

There are also some attributes with specific accessor functions to get and set values. For example, use names(x), dim(x) and class(x) instead of attr(x, "names"), attr(x, "dim") and attr(x, "class").

class(mylm <- lm(Sepal.Length ~ ., data = iris))

#> [1] "lm"

I’ve just fitted a linear model in order to predict the sepal length variable of the iris dataset based on the other variables. Using lm() gets me an object of class lm. What are the methods I can use for this object?

methods(class = class(mylm))

#>  [1] add1           alias          anova          case.names     coerce        
#>  [6] confint        cooks.distance deviance       dfbeta         dfbetas       
#> [11] drop1          dummy.coef     effects        extractAIC     family        
#> [16] formula        hatvalues      influence      initialize     kappa         
#> [21] labels         logLik         model.frame    model.matrix   nobs          
#> [26] plot           predict        print          proj           qr            
#> [31] residuals      rstandard      rstudent       show           simulate      
#> [36] slotsFromS3    summary        variable.names vcov          
#> see '?methods' for accessing help and source code

summary(mylm)

#> 
#> Call:
#> lm(formula = Sepal.Length ~ ., data = iris)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.79424 -0.21874  0.00899  0.20255  0.73103 
#> 
#> Coefficients:
#>                   Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)        2.17127    0.27979   7.760 1.43e-12 ***
#> Sepal.Width        0.49589    0.08607   5.761 4.87e-08 ***
#> Petal.Length       0.82924    0.06853  12.101  < 2e-16 ***
#> Petal.Width       -0.31516    0.15120  -2.084  0.03889 *  
#> Speciesversicolor -0.72356    0.24017  -3.013  0.00306 ** 
#> Speciesvirginica  -1.02350    0.33373  -3.067  0.00258 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.3068 on 144 degrees of freedom
#> Multiple R-squared:  0.8673, Adjusted R-squared:  0.8627 
#> F-statistic: 188.3 on 5 and 144 DF,  p-value: < 2.2e-16

plot(mylm)

S3 is the simplest way in R to create a class and to use methods on objects of this class. If you want to know more about the other types of classes, see the Advanced R book.

agent007 <- list(first = "James", last = "Bond")
agent007

#> $first
#> [1] "James"
#> 
#> $last
#> [1] "Bond"

class(agent007) <- "Person"  ## "agent007" is now an object of class "Person"
# To define an S3 method, just make a function called <generic_name>.<class_name>().
# A method should accept the same arguments as its generic; print() is
# print(x, ...), so `...` is included to absorb extra options (e.g. `digits`)
# instead of failing with an "unused argument" error.
#
# x:   an object of class "Person" (a list with elements `first` and `last`).
# ...: further arguments passed to print(); ignored here.
# Returns `x` invisibly, so that printing does not trigger a second print.
print.Person <- function(x, ...) {
  print(glue::glue("My name is {x$last}, {x$first} {x$last}."))
  invisible(x)
}

agent007

#> My name is Bond, James Bond.

# Constructor of class as simple function
# Constructor for class "Person": stores both names in a list and tags it
# with the class attribute.
Person <- function(first, last) {
  obj <- list(first = first, last = last)
  class(obj) <- "Person"
  obj
}
(me <- Person("Florian", "Privé"))

#> My name is Privé, Florian Privé.

An object can have many classes:

# Constructor for class "Worker": builds a Person, adds a `job` element and
# prepends "Worker" to the class vector so that Worker methods are tried first.
Worker <- function(first, last, job) {
  person <- Person(first, last)
  person[["job"]] <- job
  structure(person, class = c("Worker", class(person)))
}
# print() method for class "Worker": prints what the next class in class(x)
# prints (the "Person" line for objects built by Worker()), then the job.
#
# NextMethod() is used rather than calling print.Person() by name, so the
# method follows the object's class vector instead of hard-coding the parent.
# `...` matches the print(x, ...) generic so extra arguments do not error here.
#
# x:   an object of class "Worker" (a list with at least a `job` element).
# ...: further arguments passed to print(); forwarded by NextMethod().
# Returns `x` invisibly.
print.Worker <- function(x, ...) {
  NextMethod()
  print(glue::glue("I am a {x$job}."))
  invisible(x)
}

(worker_007 <- Worker("James", "Bond", "secret agent"))

#> My name is Bond, James Bond.
#> I am a secret agent.

(worker_me <- Worker("Florian", "Privé", "researcher"))

#> My name is Privé, Florian Privé.
#> I am a researcher.