Chapter 3 R programming

This chapter is about base R stuff that I find important and that is often overlooked or unknown to most R users.

Learn more with the Advanced R book.

3.1 Common mistakes

If you are using R and you think you’re in hell, this is a map for you.

– Patrick Burns

3.1.1 Equality

(0.1 + 0.2) == 0.3
#> [1] FALSE
print(c(0.1, 0.2, 0.3), digits = 20)
#> [1] 0.10000000000000001 0.20000000000000001 0.29999999999999999
all.equal(0.1 + 0.2, 0.3)  ## equality with some tolerance
#> [1] TRUE
all.equal(0.1 + 0.2, 0.4)
#> [1] "Mean relative difference: 0.3333333"
isTRUE(all.equal(0.1 + 0.2, 0.4))  ## if you want a boolean, use isTRUE()
#> [1] FALSE

3.1.2 Arguments

min(-1, 5, 118)
#> [1] -1
max(-1, 5, 118)
#> [1] 118
mean(-1, 5, 118)
#> [1] -1
median(-1, 5, 118)
#> [1] -1
args(max)
#> function (..., na.rm = FALSE) 
#> NULL
args(mean)
#> function (x, ...) 
#> NULL
args(median)
#> function (x, na.rm = FALSE, ...) 
#> NULL

Always use a vector if you’re not sure:

min(c(-1, 5, 118))
#> [1] -1
max(c(-1, 5, 118))
#> [1] 118
mean(c(-1, 5, 118))
#> [1] 40.66667
median(c(-1, 5, 118))
#> [1] 5

3.1.3 Others

sample(1:10)
#>  [1]  2  6  8  5  1  3  7  9 10  4
sample(10)
#>  [1]  4  5  7  6  9 10  1  8  3  2
sample(10.1)
#>  [1]  2  5  3  6  7  8 10  1  9  4
n <- 10
1:n-1  # is (1:n) - 1, so 0:(n - 1)
#>  [1] 0 1 2 3 4 5 6 7 8 9
1:(n-1)
#> [1] 1 2 3 4 5 6 7 8 9
seq_len(n - 1)
#> [1] 1 2 3 4 5 6 7 8 9
1:0
#> [1] 1 0
seq_len(0)  ## prefer using seq_len (e.g. in for-loops)
#> integer(0)

3.2 R base objects

3.2.1 Types

There are several “atomic” types of data; the most common ones are logical, integer, double and character (listed in the order in which they are coerced, see below). There are also raw and complex, but they are rarely used.

You can’t mix types in an atomic vector (you can in a list). Coercion will automatically occur if you mix types:

(a <- FALSE)
#> [1] FALSE
typeof(a)
#> [1] "logical"
(b <- 1:10)
#>  [1]  1  2  3  4  5  6  7  8  9 10
typeof(b)
#> [1] "integer"
c(a, b)  ## FALSE is coerced to integer 0
#>  [1]  0  1  2  3  4  5  6  7  8  9 10
(c <- 10.5)
#> [1] 10.5
typeof(c)
#> [1] "double"
(d <- c(b, c))  ## coerced to numeric
#>  [1]  1.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0 10.0 10.5
c(d, "a")  ## coerced to character
#>  [1] "1"    "2"    "3"    "4"    "5"    "6"    "7"    "8"    "9"    "10"   "10.5" "a"
c(list(1), "a")
#> [[1]]
#> [1] 1
#> 
#> [[2]]
#> [1] "a"
50 < "7"  ## "50" < "7"
#> [1] TRUE

3.2.2 Exercise

Use the automatic type coercion to convert this boolean matrix to a numeric one (with 0s and 1s). [What do you need to change to get an integer matrix instead?]

(mat <- matrix(sample(c(TRUE, FALSE), 12, replace = TRUE), 3))
#>      [,1]  [,2]  [,3]  [,4]
#> [1,] TRUE  TRUE FALSE FALSE
#> [2,] TRUE  TRUE FALSE  TRUE
#> [3,] TRUE FALSE  TRUE  TRUE

3.3 Base objects and accessors

3.3.1 Objects

  • “atomic” vector: vector of one base type (see above).

  • scalar: this doesn’t exist, this is a vector of length 1.

  • matrices / arrays: a vector with some dimensions (attribute).

(vec <- 1:12)
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12
dim(vec) <- c(3, 4)
vec
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12
class(vec)
#> [1] "matrix"
dim(vec) <- c(3, 2, 2)
vec
#> , , 1
#> 
#>      [,1] [,2]
#> [1,]    1    4
#> [2,]    2    5
#> [3,]    3    6
#> 
#> , , 2
#> 
#>      [,1] [,2]
#> [1,]    7   10
#> [2,]    8   11
#> [3,]    9   12
class(vec)
#> [1] "array"
  • list: vector of elements with possibly different types in it.

  • data.frame: a list whose elements all have the same length, and which is displayed somewhat like a matrix.

head(iris)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa
dim(iris)
#> [1] 150   5
length(iris)
#> [1] 5

3.3.2 Accessors

  1. The [ accessor is used to access a subset of the data with the same class.
(x <- runif(5))
#> [1] 0.44807130 0.62153792 0.21489806 0.04745351 0.52382861
x[2:3]
#> [1] 0.6215379 0.2148981
x[2:8]
#> [1] 0.62153792 0.21489806 0.04745351 0.52382861         NA         NA         NA
(y <- matrix(runif(12), 3))
#>           [,1]      [,2]      [,3]      [,4]
#> [1,] 0.3069655 0.2322659 0.6463609 0.3453824
#> [2,] 0.3750572 0.5349028 0.2443481 0.1290939
#> [3,] 0.9992631 0.2139430 0.6285527 0.5033093
y[4:9]  ## a matrix is a vector
#> [1] 0.2322659 0.5349028 0.2139430 0.6463609 0.2443481 0.6285527
(l <- list(a = 2:3, b = "toto", c = runif(10)))
#> $a
#> [1] 2 3
#> 
#> $b
#> [1] "toto"
#> 
#> $c
#>  [1] 0.7999544 0.1023930 0.5466600 0.7070417 0.4803148 0.1461758 0.5984381 0.5436870
#>  [9] 0.8746974 0.3394718
l[2:3]
#> $b
#> [1] "toto"
#> 
#> $c
#>  [1] 0.7999544 0.1023930 0.5466600 0.7070417 0.4803148 0.1461758 0.5984381 0.5436870
#>  [9] 0.8746974 0.3394718
head(iris)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa
head(iris[3:4])
#>   Petal.Length Petal.Width
#> 1          1.4         0.2
#> 2          1.4         0.2
#> 3          1.3         0.2
#> 4          1.5         0.2
#> 5          1.4         0.2
#> 6          1.7         0.4
class(iris[5])
#> [1] "data.frame"

You can also use logical and character vectors to index these objects.

(x <- runif(4))
#> [1] 0.73691989 0.15665654 0.46709044 0.07159353
x[c(FALSE, TRUE, FALSE, TRUE)]
#> [1] 0.15665654 0.07159353
x[c(FALSE, TRUE)]  ## logical vectors are recycled
#> [1] 0.15665654 0.07159353
head(iris[c("Petal.Length", "Species")])
#>   Petal.Length Species
#> 1          1.4  setosa
#> 2          1.4  setosa
#> 3          1.3  setosa
#> 4          1.5  setosa
#> 5          1.4  setosa
#> 6          1.7  setosa
  2. The [[ accessor is used to access a single element.
(x <- 1:10)
#>  [1]  1  2  3  4  5  6  7  8  9 10
x[[3]]
#> [1] 3
l[[2]]
#> [1] "toto"
iris[["Species"]]
#>  [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
#> [13] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
#> [25] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
#> [37] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
#> [49] setosa setosa
#>  [ reached getOption("max.print") -- omitted 100 entries ]
#> Levels: setosa versicolor virginica
Indexing lists in R. [Source: https://goo.gl/8UkcHq]

Figure 3.1: Indexing lists in R. [Source: https://goo.gl/8UkcHq]

  3. Beware of partial matching
x <- list(aardvark = 1:5)
x$a
#> [1] 1 2 3 4 5
x[["a"]]
#> NULL
x[["a", exact = FALSE]]
#> [1] 1 2 3 4 5
  4. Special use of the [ accessor for array-like data.
(mat <- matrix(1:12, 3))
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12
mat[1, ]
#> [1]  1  4  7 10
mat[, 1:2]
#>      [,1] [,2]
#> [1,]    1    4
#> [2,]    2    5
#> [3,]    3    6
mat[1, 1:2]
#> [1] 1 4
mat[1, 1:2, drop = FALSE]
#>      [,1] [,2]
#> [1,]    1    4
(two_col_ind <- cbind(c(1, 3, 2), c(1, 4, 2)))
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    3    4
#> [3,]    2    2
mat[two_col_ind]
#> [1]  1 12  5
mat[,]
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12
mat[]
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12
mat[] <- 2
mat
#>      [,1] [,2] [,3] [,4]
#> [1,]    2    2    2    2
#> [2,]    2    2    2    2
#> [3,]    2    2    2    2

If you use arrays with more than two dimensions, just add a comma for every new dimension.

3.3.3 Exercises

  1. Use the dimension attribute to make a function that computes the sum of every n consecutive elements of a vector. In which order are matrix elements stored? [Which are the special cases that you should consider?]

    advr38pkg::sum_every(1:10, 2)
    #> [1]  3  7 11 15 19
  2. Compute the mean of every numeric column of the iris dataset. Expected result:

    #> Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    #>     5.843333     3.057333     3.758000     1.199333
  3. Convert this matrix to a vector by replacing c(0,0) -> 0; c(0,1) -> 1; c(1,1) -> 2; c(1,0) -> NA

    mat <- matrix(0, 10, 2); mat[c(5, 8, 9, 12, 15, 16, 17, 19)] <- 1; mat
    #>       [,1] [,2]
    #>  [1,]    0    0
    #>  [2,]    0    1
    #>  [3,]    0    0
    #>  [4,]    0    0
    #>  [5,]    1    1
    #>  [6,]    0    1
    #>  [7,]    0    1
    #>  [8,]    1    0
    #>  [9,]    1    1
    #> [10,]    0    0

    by using this matrix:

    (decode <- matrix(c(0, NA, 1, 2), 2))
    #>      [,1] [,2]
    #> [1,]    0    1
    #> [2,]   NA    2

    Begin by using apply() and then replace it by a special accessor; what is the benefit?

    Expected result:

    #>  [1]  0  1  0  0  2  1  1 NA  2  0

3.4 Useful R base functions

In this section, I present some useful R base functions (also see this comprehensive list in French and this one in English):

3.4.1 General

# To get some help
?topic

# Run code from the example section
example(sum)
# Structure overview
str(iris)
#> 'data.frame':    150 obs. of  5 variables:
#>  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#>  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#>  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#>  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#>  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# List objects of environment
ls()
#>  [1] "a"           "b"           "c"           "d"           "decode"      "l"          
#>  [7] "mat"         "n"           "osef"        "two_col_ind" "vec"         "x"          
#> [13] "y"
# Remove objects from the environment
rm(list = ls())  ## remove all objects in the environment
# list of available S3 methods
methods(summary)
#>  [1] summary.aov                    summary.aovlist*              
#>  [3] summary.aspell*                summary.check_packages_in_dir*
#>  [5] summary.connection             summary.data.frame            
#>  [7] summary.Date                   summary.default               
#>  [9] summary.ecdf*                  summary.factor                
#> [11] summary.glm                    summary.infl*                 
#> [13] summary.lm                     summary.loess*                
#> [15] summary.manova                 summary.matrix                
#> [17] summary.mlm*                   summary.nls*                  
#> [19] summary.packageStatus*         summary.POSIXct               
#> [21] summary.POSIXlt                summary.ppr*                  
#> [23] summary.prcomp*                summary.princomp*             
#> [25] summary.proc_time              summary.rlang_error*          
#> [27] summary.rlang_trace*           summary.srcfile               
#> [29] summary.srcref                 summary.stepfun               
#> [31] summary.stl*                   summary.table                 
#> [33] summary.tukeysmooth*           summary.warnings              
#> see '?methods' for accessing help and source code
methods(class = "lm")
#>  [1] add1           alias          anova          case.names     coerce        
#>  [6] confint        cooks.distance deviance       dfbeta         dfbetas       
#> [11] drop1          dummy.coef     effects        extractAIC     family        
#> [16] formula        hatvalues      influence      initialize     kappa         
#> [21] labels         logLik         model.frame    model.matrix   nobs          
#> [26] plot           predict        print          proj           qr            
#> [31] residuals      rstandard      rstudent       show           simulate      
#> [36] slotsFromS3    summary        variable.names vcov          
#> see '?methods' for accessing help and source code
# Call a function with arguments as a list
(list_of_int <- lapply(1:5, function(i) i))
#> [[1]]
#> [1] 1
#> 
#> [[2]]
#> [1] 2
#> 
#> [[3]]
#> [1] 3
#> 
#> [[4]]
#> [1] 4
#> 
#> [[5]]
#> [1] 5
do.call('c', list_of_int)
#> [1] 1 2 3 4 5

3.4.2 Sequence and vector operations

1:10  ## of type integer
#>  [1]  1  2  3  4  5  6  7  8  9 10
seq(1, 10, by = 2)  ## of type double
#> [1] 1 3 5 7 9
seq(1, 100, length.out = 10)
#>  [1]   1  12  23  34  45  56  67  78  89 100
seq_len(5)
#> [1] 1 2 3 4 5
seq_along(21:24)
#> [1] 1 2 3 4
rep(1:4, 2)
#> [1] 1 2 3 4 1 2 3 4
rep(1:4, each = 2)
#> [1] 1 1 2 2 3 3 4 4
rep(1:4, 4:1)
#>  [1] 1 1 1 1 2 2 2 3 3 4
rep_len(1:3, 8)
#> [1] 1 2 3 1 2 3 1 2
replicate(5, rnorm(10))  ## How to use a multiline expression?
#>              [,1]         [,2]       [,3]        [,4]        [,5]
#>  [1,]  0.93846460  0.084813875  2.0453056 -1.21374614 -0.52956126
#>  [2,]  0.21363856  0.472690627 -1.1620982 -0.07644218  1.20930404
#>  [3,]  0.53807914  2.382310528  0.4647532  0.60873265 -1.33783998
#>  [4,] -0.64570483 -0.008324880  1.7755535  0.29781600 -0.02224145
#>  [5,]  0.05012369 -0.072429278 -1.5604312  0.10614859 -0.20997969
#>  [6,]  0.56267527  0.396873009  0.2417296  0.01299187  0.77520605
#>  [7,] -1.21505483 -0.002609421 -0.3970332  0.61041314 -2.78869266
#>  [8,] -1.81045026  0.286111450 -0.8375245  0.73542016 -0.19036000
#>  [9,]  0.56035322  0.022867584 -0.7482098 -0.73185847 -1.64131419
#> [10,]  1.20286080  1.323816373  0.6813290 -0.28415745 -1.02107054
sort(c(1, 6, 8, 2, 2))
#> [1] 1 2 2 6 8
order(c(1, 6, 8, 2, 2), c(0, 0, 0, 2, 1))
#> [1] 1 5 4 2 3
rank(c(1, 6, 8, 2, 2))
#> [1] 1.0 4.0 5.0 2.5 2.5
rank(c(1, 6, 8, 2, 2), ties.method = "first")
#> [1] 1 4 5 2 3
sort(c("a1", "a2", "a10"))
#> [1] "a1"  "a10" "a2"
gtools::mixedsort(c("a1", "a2", "a10"))  ## not in base, obviously
#> [1] "a1"  "a2"  "a10"
which.max(c(1, 5, 3, 6, 2, 0))
#> [1] 4
which.min(c(1, 5, 3, 6, 2, 0))
#> [1] 6
unique(c(1, NA, 2, 3, 2, NA, 3))
#> [1]  1 NA  2  3
table(rep(1:4, 4:1))
#> 
#> 1 2 3 4 
#> 4 3 2 1
table(A = c(1, 1, 1, 2, 2), B = c(1, 2, 1, 2, 1))
#>    B
#> A   1 2
#>   1 2 1
#>   2 1 1
sample(10)
#>  [1]  4  9  1  6  2  5 10  8  3  7
sample(3:10, 5)
#> [1]  6 10  7  5  9
sample(3:10, 50, replace = TRUE)
#>  [1]  7  9  5  9  8  7  6  4  8  8  3  9  9 10  6  9  5  7  5 10  3  9  3  3  3  4  6  5
#> [29]  7  5  9  9  7  9  7 10  4  7  8  5  5  6  6  4  7  9  3  4  6  6
round(x <- runif(10, max = 100))
#>  [1] 73 56 48  9 52 59  0  3 19 47
round(x, digits = 2)
#>  [1] 72.75 56.10 48.14  9.23 52.13 58.72  0.11  3.03 18.96 46.82
round(x, -1)
#>  [1] 70 60 50 10 50 60  0  0 20 50
pmin(1:4, 4:1)
#> [1] 1 2 2 1
pmax(1:4, 4:1)
#> [1] 4 3 3 4
outer(1:4, 1:3, '+')
#>      [,1] [,2] [,3]
#> [1,]    2    3    4
#> [2,]    3    4    5
#> [3,]    4    5    6
#> [4,]    5    6    7
expand.grid(param1 = c(5, 50), param2 = c(1, 3, 10))
#>   param1 param2
#> 1      5      1
#> 2     50      1
#> 3      5      3
#> 4     50      3
#> 5      5     10
#> 6     50     10

Also see this nice Q/A on grouping functions and the *apply family and this book chapter about looping.

3.4.3 Character operations

paste("I", "am", "me")
#> [1] "I am me"
paste0("test", 0)
#> [1] "test0"
paste0("PC", 1:10)
#>  [1] "PC1"  "PC2"  "PC3"  "PC4"  "PC5"  "PC6"  "PC7"  "PC8"  "PC9"  "PC10"
me <- "Florian"
glue::glue("I am {me}")  ## not in base, but SO useful
#> I am Florian
(x <- list.files(pattern = "\\.Rmd$", full.names = TRUE))
#> [1] "./good-practices.Rmd" "./index.Rmd"          "./intro.Rmd"         
#> [4] "./packages.Rmd"       "./performance.Rmd"    "./rprog.Rmd"         
#> [7] "./shiny.Rmd"          "./tidyverse.Rmd"
sub("\\.Rmd$", ".pdf", x)
#> [1] "./good-practices.pdf" "./index.pdf"          "./intro.pdf"         
#> [4] "./packages.pdf"       "./performance.pdf"    "./rprog.pdf"         
#> [7] "./shiny.pdf"          "./tidyverse.pdf"
(y <- sample(letters[1:4], 10, replace = TRUE))
#>  [1] "c" "d" "d" "d" "a" "a" "d" "d" "d" "d"
match(y, letters[1:4])
#>  [1] 3 4 4 4 1 1 4 4 4 4
y %in% letters[1:2]
#>  [1] FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
rle(rep(1:4, 4:1))
#> Run Length Encoding
#>   lengths: int [1:4] 4 3 2 1
#>   values : int [1:4] 1 2 3 4
split(1:12, rep(letters[1:3], 4))
#> $a
#> [1]  1  4  7 10
#> 
#> $b
#> [1]  2  5  8 11
#> 
#> $c
#> [1]  3  6  9 12
intersect(letters[1:4], letters[3:5])
#> [1] "c" "d"
union(letters[1:4], letters[3:5])
#> [1] "a" "b" "c" "d" "e"
setdiff(letters[1:4], letters[3:5])
#> [1] "a" "b"

3.4.4 Logical operators

TRUE | stop("will go there")
#> Error in eval(expr, envir, enclos): will go there
TRUE || stop("won't go there")  ## won't evaluate second condition if first one is TRUE
#> [1] TRUE
c(TRUE, FALSE, TRUE, TRUE) & c(FALSE, TRUE, TRUE, FALSE) 
#> [1] FALSE FALSE  TRUE FALSE
c(TRUE, FALSE, TRUE, TRUE) && c(FALSE, TRUE, TRUE, FALSE)  ## no warning!!
#> [1] FALSE
(x <- rnorm(10))
#>  [1]  1.5824059  1.1817680  0.1608777  0.2469118 -0.6086606 -0.7466633 -1.0254717
#>  [8] -0.6333350 -0.3989180  1.3663350
ifelse(x > 0, x, -x)
#>  [1] 1.5824059 1.1817680 0.1608777 0.2469118 0.6086606 0.7466633 1.0254717 0.6333350
#>  [9] 0.3989180 1.3663350

Be careful with ifelse() (learn more there); for example:

ifelse(FALSE, 0, 1:5)
#> [1] 1
`if`(FALSE, 0, 1:5)
#> [1] 1 2 3 4 5
if (FALSE) 0 else 1:5
#> [1] 1 2 3 4 5

3.4.5 Exercises

  1. Use sample(), rep_len() and split() to make a function that randomly splits some indices in a list of K groups of indices (like for cross-validation). [Which are the special cases that you should consider?]

    advr38pkg::split_ind(1:40, 3)
    #> $`1`
    #>  [1]  4  6  7 12 17 18 22 23 27 29 30 31 33 35
    #> 
    #> $`2`
    #>  [1]  2  5  9 10 11 14 15 24 26 28 38 39 40
    #> 
    #> $`3`
    #>  [1]  1  3  8 13 16 19 20 21 25 32 34 36 37
  2. Use replicate() and sample() to get a 95% confidence interval (using bootstrapping) for the mean of the following vector:

    set.seed(1)
    (x <- rnorm(10))
    #>  [1] -0.6264538  0.1836433 -0.8356286  1.5952808  0.3295078 -0.8204684  0.4874291
    #>  [8]  0.7383247  0.5757814 -0.3053884
    mean(x)
    #> [1] 0.1322028

    Expected output (approximately):

    #>       2.5%      97.5% 
    #> -0.3145143  0.5998608
  3. Use match() and some special accessor to add a column “my_val” to the data frame my_mtcars, containing, for each row, the value of the column named in “my_col”. [Can your solution be used for any number of column names?]

    my_mtcars <- mtcars[c("mpg", "hp")]
    my_mtcars$my_col <- sample(c("mpg", "hp"), size = nrow(my_mtcars), replace = TRUE)
    head(my_mtcars)
    #>                    mpg  hp my_col
    #> Mazda RX4         21.0 110    mpg
    #> Mazda RX4 Wag     21.0 110    mpg
    #> Datsun 710        22.8  93     hp
    #> Hornet 4 Drive    21.4 110     hp
    #> Hornet Sportabout 18.7 175    mpg
    #> Valiant           18.1 105     hp

    Expected result (head):

    #>                    mpg  hp my_col my_val
    #> Mazda RX4         21.0 110    mpg   21.0
    #> Mazda RX4 Wag     21.0 110    mpg   21.0
    #> Datsun 710        22.8  93     hp     93
    #> Hornet 4 Drive    21.4 110     hp    110
    #> Hornet Sportabout 18.7 175    mpg   18.7
    #> Valiant           18.1 105     hp    105
  4. In the following data frame (recall that a data frame is also a list), for the first 3 columns, replace letters by corresponding numbers based on the code:

    df <- data.frame(
      id1 = c("a", "f", "a"),
      id2 = c("b", "e", "e"), 
      id3 = c("c", "d", "f"),
      inter = c(7.343, 2.454, 3.234),
      stringsAsFactors = FALSE
    )
    df
    #>   id1 id2 id3 inter
    #> 1   a   b   c 7.343
    #> 2   f   e   d 2.454
    #> 3   a   e   f 3.234
    (code <- setNames(1:6, letters[1:6]))
    #> a b c d e f 
    #> 1 2 3 4 5 6

    Expected result:

    #>   id1 id2 id3 inter
    #> 1   1   2   3 7.343
    #> 2   6   5   4 2.454
    #> 3   1   5   6 3.234

3.5 Environments and scoping

Lexical scoping determines where to look for values, not when to look for them. R looks for values when the function is run, not when it’s created. This means that the output of a function can be different depending on objects outside its environment:

# h() sets a local x and defines the closure f(), which has no x of its own.
# When f() runs, x is looked up in the environment where f was defined,
# i.e. the body of h().
h <- function() {
  x <- 10
  f <- function() x + 1  # x is a free variable of f
  f()
}
x <- 100
h()
#> [1] 11

Variable x is not defined inside f so R will look at the environment of f (where f was defined) and then at the parent environment, and so on. Here, the first x that is found has value 10.

Be aware that, for functions, package environments are checked last, so you can redefine (mask) an existing function without noticing.

# Define a function named `c` in the current environment; it masks base::c()
c <- function(...) paste0(...)
c(1, 2, 3)
#> [1] "123"
base::c(1, 2, 3)  ## you need to specify the package explicitly
#> [1] 1 2 3
rm(c)  ## remove the new function from the environment
c(1, 2, 3)
#> [1] 1 2 3

You can use the <<- operator to change the value of an object in an upper environment:

count1 <- 0
count2 <- 0
# f() updates two counters in two different ways and returns i + 1
f <- function(i) {
  count1 <-  count1 + 1  ## creates a new count1 local to this call; the outer count1 is unchanged
  count2 <<- count2 + 1  ## modifies the count2 found in an enclosing environment
  i + 1
}
sapply(1:10, f)
#>  [1]  2  3  4  5  6  7  8  9 10 11
c(count1, count2)
#> [1]  0 10

Finally, how does ... work? Basically, whatever is passed in ... is copied and pasted where ... is used:

# f1() forwards all of its arguments, names included, to list()
f1 <- function(...) list(...)
f1(a = 2, b = 3)
#> $a
#> [1] 2
#> 
#> $b
#> [1] 3
list(a = 2, b = 3)
#> $a
#> [1] 2
#> 
#> $b
#> [1] 3

Learn more about functions and scoping rules of R with the R Programming for Data Science book.

3.6 Attributes and classes

Attributes are metadata associated with an object. You can get/set the list of attributes with attributes() or one particular attribute with attr().

attributes(iris)
#> $names
#> [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"     
#> 
#> $class
#> [1] "data.frame"
#> 
#> $row.names
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
#> [29] 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
#>  [ reached getOption("max.print") -- omitted 100 entries ]
class(iris)
#> [1] "data.frame"
attr(iris, "row.names")
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
#> [29] 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
#>  [ reached getOption("max.print") -- omitted 100 entries ]

You can use structure() to create an object and add some arbitrary attributes.

structure(1:10, my_fancy_attribute = "blabla")
#>  [1]  1  2  3  4  5  6  7  8  9 10
#> attr(,"my_fancy_attribute")
#> [1] "blabla"

There are also some attributes with specific accessor functions to get and set values. For example, use names(x), dim(x) and class(x) instead of attr(x, "names"), attr(x, "dim") and attr(x, "class").


class(mylm <- lm(Sepal.Length ~ ., data = iris))
#> [1] "lm"

I’ve just fitted a linear model in order to predict the sepal length variable of the iris dataset based on the other variables. Using lm() gets me an object of class lm. What are the methods I can use for this object?

methods(class = class(mylm))
#>  [1] add1           alias          anova          case.names     coerce        
#>  [6] confint        cooks.distance deviance       dfbeta         dfbetas       
#> [11] drop1          dummy.coef     effects        extractAIC     family        
#> [16] formula        hatvalues      influence      initialize     kappa         
#> [21] labels         logLik         model.frame    model.matrix   nobs          
#> [26] plot           predict        print          proj           qr            
#> [31] residuals      rstandard      rstudent       show           simulate      
#> [36] slotsFromS3    summary        variable.names vcov          
#> see '?methods' for accessing help and source code
summary(mylm)
#> 
#> Call:
#> lm(formula = Sepal.Length ~ ., data = iris)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.79424 -0.21874  0.00899  0.20255  0.73103 
#> 
#> Coefficients:
#>                   Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)        2.17127    0.27979   7.760 1.43e-12 ***
#> Sepal.Width        0.49589    0.08607   5.761 4.87e-08 ***
#> Petal.Length       0.82924    0.06853  12.101  < 2e-16 ***
#> Petal.Width       -0.31516    0.15120  -2.084  0.03889 *  
#> Speciesversicolor -0.72356    0.24017  -3.013  0.00306 ** 
#> Speciesvirginica  -1.02350    0.33373  -3.067  0.00258 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.3068 on 144 degrees of freedom
#> Multiple R-squared:  0.8673, Adjusted R-squared:  0.8627 
#> F-statistic: 188.3 on 5 and 144 DF,  p-value: < 2.2e-16
plot(mylm)


R has the easiest way to create a class and to use methods on objects of this class; it is called S3. If you want to know more about the other types of classes, see the Advanced R book.

agent007 <- list(first = "James", last = "Bond")
agent007
#> $first
#> [1] "James"
#> 
#> $last
#> [1] "Bond"
class(agent007) <- "Person"  ## "agent007" is now an object of class "Person"
# Just make a function called <method_name>.<class_name>()
# print() method for objects of class "Person".
#   x:   an object with elements `first` and `last`.
#   ...: further arguments coming from the print() generic; accepted so that
#        the signature is consistent with print(x, ...), otherwise ignored.
# Prints one sentence and returns `x` invisibly.
print.Person <- function(x, ...) {
  print(glue::glue("My name is {x$last}, {x$first} {x$last}."))
  invisible(x)
}

agent007
#> My name is Bond, James Bond.
# Constructor of class as simple function
# Build a list holding the two names, then tag it with the S3 class "Person"
Person <- function(first, last) {
  obj <- list(first = first, last = last)
  class(obj) <- "Person"
  obj
}
(me <- Person("Florian", "Privé"))
#> My name is Privé, Florian Privé.

An object can have many classes:

# Constructor for class "Worker": start from a Person object, add a `job`
# element and put "Worker" in front of the existing class vector
Worker <- function(first, last, job) {
  person <- Person(first, last)
  person[["job"]] <- job
  structure(person, class = c("Worker", class(person)))
}
# print() method for objects of class "Worker".
#   x:   an object with a `job` element whose class vector has a further
#        class after "Worker" that provides its own print() method.
#   ...: further arguments coming from the print() generic; accepted so that
#        the signature is consistent with print(x, ...).
# Prints what the next class in the class vector prints, then the job line,
# and returns `x` invisibly.
print.Worker <- function(x, ...) {
  NextMethod()  # dispatch to the print() method of the next class of x
  print(glue::glue("I am a {x$job}."))
  invisible(x)
}

(worker_007 <- Worker("James", "Bond", "secret agent"))
#> My name is Bond, James Bond.
#> I am a secret agent.
(worker_me <- Worker("Florian", "Privé", "postdoc"))
#> My name is Privé, Florian Privé.
#> I am a postdoc.