Chapter 3 R programming

This chapter is about base R features that I find important and that are often overlooked by, or unknown to, most R users.

Learn more with the Advanced R book.

3.1 Common mistakes

If you are using R and you think you’re in hell, this is a map for you.

– Patrick Burns

3.1.1 Equality

(0.1 + 0.2) == 0.3
#> [1] FALSE
print(c(0.1, 0.2, 0.3), digits = 20)
#> [1] 0.10000000000000000555 0.20000000000000001110 0.29999999999999998890
all.equal(0.1 + 0.2, 0.3)  ## equality with some tolerance
#> [1] TRUE
isTRUE(all.equal(0.1 + 0.2, 0.4))  ## if you want a boolean, use isTRUE()
#> [1] FALSE

3.1.2 Arguments

min(-1, 5, 118)
#> [1] -1
max(-1, 5, 118)
#> [1] 118
args(max)
#> function (..., na.rm = FALSE) 
#> NULL
mean(-1, 5, 118)
#> [1] -1
args(mean)
#> function (x, ...) 
#> NULL
median(-1, 5, 118)
#> [1] -1
args(median)
#> function (x, na.rm = FALSE, ...) 
#> NULL

Always use one vector if you’re not sure:

min(c(-1, 5, 118))
#> [1] -1
max(c(-1, 5, 118))
#> [1] 118
mean(c(-1, 5, 118))
#> [1] 40.66667
median(c(-1, 5, 118))
#> [1] 5

3.1.3 Others

sample(1:10)
#>  [1]  9 10  8  4  3  2  6  7  1  5
sample(10)
#>  [1]  6  2  9  5  8  7  4  1  3 10
sample(10.1)
#>  [1]  2  4  9  3  1  5 10  8  7  6
1:0
#> [1] 1 0
seq_len(0)  ## prefer using seq_len (e.g. in for-loops)
#> integer(0)
n <- 10
1:n+1  # is (1:n) + 1, so 2:(n + 1)
#>  [1]  2  3  4  5  6  7  8  9 10 11
1:(n+1)
#>  [1]  1  2  3  4  5  6  7  8  9 10 11
seq_len(n + 1)
#>  [1]  1  2  3  4  5  6  7  8  9 10 11

3.2 R base objects

3.2.1 Types

There are many “atomic” types of data: logical, integer, double and character (in this order, see below). There are also raw and complex but they are rarely used.

You can’t mix types in an atomic vector (you can in a list). Coercion will automatically occur if you mix types:

(a <- FALSE)
#> [1] FALSE
typeof(a)
#> [1] "logical"
(b <- 1:10)
#>  [1]  1  2  3  4  5  6  7  8  9 10
typeof(b)
#> [1] "integer"
c(a, b)  ## FALSE is coerced to integer 0
#>  [1]  0  1  2  3  4  5  6  7  8  9 10
(c <- 10.5)
#> [1] 10.5
typeof(c)
#> [1] "double"
(d <- c(b, c))  ## coerced to numeric
#>  [1]  1.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0  9.0 10.0 10.5
c(d, "a")  ## coerced to character
#>  [1] "1"    "2"    "3"    "4"    "5"    "6"    "7"    "8"    "9"    "10"   "10.5" "a"
c(list(1), "a")
#> [[1]]
#> [1] 1
#> 
#> [[2]]
#> [1] "a"
50 < "7"  ## "50" < "7"
#> [1] TRUE

3.2.2 Exercise

Use the automatic type coercion to convert this boolean matrix to a numeric one (with 0s and 1s). [What do you need to change to get an integer matrix instead?]

(mat <- matrix(sample(c(TRUE, FALSE), 12, replace = TRUE), 3))
#>       [,1]  [,2]  [,3] [,4]
#> [1,]  TRUE FALSE  TRUE TRUE
#> [2,]  TRUE  TRUE  TRUE TRUE
#> [3,] FALSE FALSE FALSE TRUE

3.3 Base objects and accessors

3.3.1 Objects

  • “atomic” vector: vector of one base type (see above).

  • scalar: this doesn’t exist in R; a single value is simply a vector of length 1.

  • matrices / arrays: a vector with some dimensions (attribute).

(vec <- 1:12)
#>  [1]  1  2  3  4  5  6  7  8  9 10 11 12
dim(vec) <- c(3, 4)
vec
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12
class(vec)
#> [1] "matrix"
dim(vec) <- c(3, 2, 2)
vec
#> , , 1
#> 
#>      [,1] [,2]
#> [1,]    1    4
#> [2,]    2    5
#> [3,]    3    6
#> 
#> , , 2
#> 
#>      [,1] [,2]
#> [1,]    7   10
#> [2,]    8   11
#> [3,]    9   12
class(vec)
#> [1] "array"
  • list: vector of elements with possibly different types in it.

  • data.frame: a list whose elements have the same lengths, and formatted somewhat as a matrix.

3.3.2 Accessors

  1. The [ accessor is used to access a subset of the data with the same class.
(x <- runif(5))
#> [1] 0.3742428 0.1967849 0.9906526 0.3354000 0.7317043
x[2:3]
#> [1] 0.1967849 0.9906526
x[2:8]
#> [1] 0.1967849 0.9906526 0.3354000 0.7317043        NA        NA        NA
(y <- matrix(runif(12), 3))
#>           [,1]      [,2]       [,3]      [,4]
#> [1,] 0.7640510 0.9464476 0.02809067 0.8516801
#> [2,] 0.5257313 0.4339700 0.10169373 0.5641369
#> [3,] 0.9711716 0.7862585 0.99179505 0.5386078
y[4:9]  ## a matrix is a vector
#> [1] 0.94644762 0.43397003 0.78625854 0.02809067 0.10169373 0.99179505
(l <- list(a = 2:3, b = "toto", c = runif(10)))
#> $a
#> [1] 2 3
#> 
#> $b
#> [1] "toto"
#> 
#> $c
#>  [1] 0.7729944 0.2658974 0.9983454 0.9055118 0.1364265 0.5631251 0.1666813 0.4986730
#>  [9] 0.3231522 0.6208681
l[2:3]
#> $b
#> [1] "toto"
#> 
#> $c
#>  [1] 0.7729944 0.2658974 0.9983454 0.9055118 0.1364265 0.5631251 0.1666813 0.4986730
#>  [9] 0.3231522 0.6208681
head(iris)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa
head(iris[3:4])
#>   Petal.Length Petal.Width
#> 1          1.4         0.2
#> 2          1.4         0.2
#> 3          1.3         0.2
#> 4          1.5         0.2
#> 5          1.4         0.2
#> 6          1.7         0.4
class(iris[5])
#> [1] "data.frame"

You can also use logical and character vectors to index these objects.

(x <- runif(4))
#> [1] 0.4454293 0.4200315 0.4739141 0.5814565
x[c(FALSE, TRUE, FALSE, TRUE)]
#> [1] 0.4200315 0.5814565
x[c(FALSE, TRUE)]  ## logical vectors are recycled
#> [1] 0.4200315 0.5814565
head(iris[c("Petal.Length", "Species")])
#>   Petal.Length Species
#> 1          1.4  setosa
#> 2          1.4  setosa
#> 3          1.3  setosa
#> 4          1.5  setosa
#> 5          1.4  setosa
#> 6          1.7  setosa
  2. The [[ accessor is used to access a single element.
(x <- 1:10)
#>  [1]  1  2  3  4  5  6  7  8  9 10
x[[3]]
#> [1] 3
l[[2]]
#> [1] "toto"
iris[["Species"]]
#>   [1] setosa     setosa     setosa     setosa     setosa     setosa     setosa    
#>   [8] setosa     setosa     setosa     setosa     setosa     setosa     setosa    
#>  [15] setosa     setosa     setosa     setosa     setosa     setosa     setosa    
#>  [22] setosa     setosa     setosa     setosa     setosa     setosa     setosa    
#>  [29] setosa     setosa     setosa     setosa     setosa     setosa     setosa    
#>  [36] setosa     setosa     setosa     setosa     setosa     setosa     setosa    
#>  [43] setosa     setosa     setosa     setosa     setosa     setosa     setosa    
#>  [50] setosa     versicolor versicolor versicolor versicolor versicolor versicolor
#>  [57] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
#>  [64] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
#>  [71] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
#>  [78] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
#>  [85] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
#>  [92] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
#>  [99] versicolor versicolor virginica  virginica  virginica  virginica  virginica 
#> [106] virginica  virginica  virginica  virginica  virginica  virginica  virginica 
#> [113] virginica  virginica  virginica  virginica  virginica  virginica  virginica 
#> [120] virginica  virginica  virginica  virginica  virginica  virginica  virginica 
#> [127] virginica  virginica  virginica  virginica  virginica  virginica  virginica 
#> [134] virginica  virginica  virginica  virginica  virginica  virginica  virginica 
#> [141] virginica  virginica  virginica  virginica  virginica  virginica  virginica 
#> [148] virginica  virginica  virginica 
#> Levels: setosa versicolor virginica
Indexing lists in R. [Source: https://goo.gl/8UkcHq]

Figure 3.1: Indexing lists in R. [Source: https://goo.gl/8UkcHq]

  3. Beware partial matching
x <- list(aardvark = 1:5)
x$a
#> [1] 1 2 3 4 5
x[["a"]]
#> NULL
x[["a", exact = FALSE]]
#> [1] 1 2 3 4 5
  4. Special use of the [ accessor for array-like data.
(mat <- matrix(1:12, 3))
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12
mat[1, ]
#> [1]  1  4  7 10
mat[, 1:2]
#>      [,1] [,2]
#> [1,]    1    4
#> [2,]    2    5
#> [3,]    3    6
mat[1, 1:2]
#> [1] 1 4
mat[1, 1:2, drop = FALSE]
#>      [,1] [,2]
#> [1,]    1    4
mat[cbind(1, 1:2)]
#> [1] 1 4
mat[,]
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12
mat[]
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12
mat[] <- 2
mat
#>      [,1] [,2] [,3] [,4]
#> [1,]    2    2    2    2
#> [2,]    2    2    2    2
#> [3,]    2    2    2    2

Just add a comma for every new dimension.

3.3.3 Exercises

  1. Use the dimension attribute to make a function that computes the sums of every n elements of a vector. In which order are matrix elements stored? [Which are the special cases that you should consider?]

    advr38pkg::sum_every(1:10, 2)
    #> [1]  3  7 11 15 19
  2. Compute the mean of every numeric column of the iris dataset. Expected result:

    #> Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    #>     5.843333     3.057333     3.758000     1.199333
  3. Convert this matrix to a vector by replacing c(0,0) -> 0; c(0,1) -> 1; c(1,1) -> 2; c(1,0) -> NA

    mat <- matrix(0, 10, 2); mat[c(5, 8, 9, 12, 15, 16, 17, 19)] <- 1; mat
    #>       [,1] [,2]
    #>  [1,]    0    0
    #>  [2,]    0    1
    #>  [3,]    0    0
    #>  [4,]    0    0
    #>  [5,]    1    1
    #>  [6,]    0    1
    #>  [7,]    0    1
    #>  [8,]    1    0
    #>  [9,]    1    1
    #> [10,]    0    0

    by using this matrix:

    (decode <- matrix(c(0, NA, 1, 2), 2))
    #>      [,1] [,2]
    #> [1,]    0    1
    #> [2,]   NA    2

    Begin by using apply() and then replace it by a special accessor; what is the advantage?

    Expected result:

    #>  [1]  0  1  0  0  2  1  1 NA  2  0

3.4 Useful R base functions

In this section, I mention some useful R base functions (also see this comprehensive list in French and this one in English):

3.4.1 General

# To get some help
?topic
#> No documentation for 'topic' in specified packages and libraries:
#> you could try '??topic'
# Run code from the example section
example(sum)
#> 
#> sum> ## Pass a vector to sum, and it will add the elements together.
#> sum> sum(1:5)
#> [1] 15
#> 
#> sum> ## Pass several numbers to sum, and it also adds the elements.
#> sum> sum(1, 2, 3, 4, 5)
#> [1] 15
#> 
#> sum> ## In fact, you can pass vectors into several arguments, and everything gets added.
#> sum> sum(1:2, 3:5)
#> [1] 15
#> 
#> sum> ## If there are missing values, the sum is unknown, i.e., also missing, ....
#> sum> sum(1:5, NA)
#> [1] NA
#> 
#> sum> ## ... unless  we exclude missing values explicitly:
#> sum> sum(1:5, NA, na.rm = TRUE)
#> [1] 15
# Structure overview
str(iris)
#> 'data.frame':    150 obs. of  5 variables:
#>  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#>  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#>  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#>  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#>  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# List objects of environment
a <- 1:4; b <- list()
ls()
#>  [1] "a"               "b"               "c"               "d"              
#>  [5] "decode"          "l"               "mat"             "n"              
#>  [9] "vec"             "WORDS_TO_IGNORE" "x"               "y"
# Remove objects from the environment
rm(list = ls())  ## remove all objects in the environment
# list of available S3 methods
methods(summary)
#>  [1] summary.aov                    summary.aovlist*              
#>  [3] summary.aspell*                summary.check_packages_in_dir*
#>  [5] summary.connection             summary.data.frame            
#>  [7] summary.Date                   summary.default               
#>  [9] summary.ecdf*                  summary.factor                
#> [11] summary.glm                    summary.infl*                 
#> [13] summary.lm                     summary.loess*                
#> [15] summary.manova                 summary.matrix                
#> [17] summary.mlm*                   summary.nls*                  
#> [19] summary.packageStatus*         summary.PDF_Dictionary*       
#> [21] summary.PDF_Stream*            summary.POSIXct               
#> [23] summary.POSIXlt                summary.ppr*                  
#> [25] summary.prcomp*                summary.princomp*             
#> [27] summary.proc_time              summary.srcfile               
#> [29] summary.srcref                 summary.stepfun               
#> [31] summary.stl*                   summary.table                 
#> [33] summary.tukeysmooth*          
#> see '?methods' for accessing help and source code
methods(class = "lm")
#>  [1] add1           alias          anova          case.names     coerce        
#>  [6] confint        cooks.distance deviance       dfbeta         dfbetas       
#> [11] drop1          dummy.coef     effects        extractAIC     family        
#> [16] formula        hatvalues      influence      initialize     kappa         
#> [21] labels         logLik         model.frame    model.matrix   nobs          
#> [26] plot           predict        print          proj           qr            
#> [31] residuals      rstandard      rstudent       show           simulate      
#> [36] slotsFromS3    summary        variable.names vcov          
#> see '?methods' for accessing help and source code
# Call a function with arguments as a list
(list_of_int <- lapply(1:5, function(i) i))
#> [[1]]
#> [1] 1
#> 
#> [[2]]
#> [1] 2
#> 
#> [[3]]
#> [1] 3
#> 
#> [[4]]
#> [1] 4
#> 
#> [[5]]
#> [1] 5
do.call('c', list_of_int)
#> [1] 1 2 3 4 5

3.4.2 Sequence and vector operations

1:10  ## of type integer
#>  [1]  1  2  3  4  5  6  7  8  9 10
seq(1, 10, by = 2)  ## of type double
#> [1] 1 3 5 7 9
seq(1, 100, length.out = 10)
#>  [1]   1  12  23  34  45  56  67  78  89 100
seq_len(5)
#> [1] 1 2 3 4 5
seq_along(21:24)
#> [1] 1 2 3 4
rep(1:4, 2)
#> [1] 1 2 3 4 1 2 3 4
rep(1:4, each = 2)
#> [1] 1 1 2 2 3 3 4 4
rep(1:4, 4:1)
#>  [1] 1 1 1 1 2 2 2 3 3 4
rep_len(1:3, 8)
#> [1] 1 2 3 1 2 3 1 2
replicate(5, rnorm(10))  ## How to use a multiline expression?
#>             [,1]        [,2]        [,3]         [,4]       [,5]
#>  [1,] -0.5088130  1.49534876 -1.57672742  1.209146592 -0.2330422
#>  [2,]  1.8018798  0.20847523 -1.19125327  1.781712985  1.7505097
#>  [3,]  1.7394319 -0.86809087  1.04378069  0.006355165 -1.0317443
#>  [4,] -0.5260837  0.89470546  2.22787831 -0.781838677  0.2569092
#>  [5,] -1.2306774 -1.36646753  0.85436122  0.357834120 -0.4859491
#>  [6,]  0.8516872  1.16793210 -0.04125081 -0.724865082 -1.0674001
#>  [7,]  1.8707400  2.18597997 -0.73642035 -0.373385460 -0.1700187
#>  [8,]  1.8103187  1.41278136  1.61459563  0.621432210  0.2580942
#>  [9,]  0.5264246  0.05163903  1.41141023  0.633884566  0.8802899
#> [10,] -0.6339748  1.97101677  0.67260521 -0.391832393 -0.9342480
sort(c(1, 6, 8, 2, 2))
#> [1] 1 2 2 6 8
order(c(1, 6, 8, 2, 2), c(0, 0, 0, 2, 1))
#> [1] 1 5 4 2 3
rank(c(1, 6, 8, 2, 2))
#> [1] 1.0 4.0 5.0 2.5 2.5
rank(c(1, 6, 8, 2, 2), ties.method = "first")
#> [1] 1 4 5 2 3
sort(c("a1", "a2", "a10"))
#> [1] "a1"  "a10" "a2"
gtools::mixedsort(c("a1", "a2", "a10"))  ## not in base, obviously
#> [1] "a1"  "a2"  "a10"
which.max(c(1, 5, 3, 6, 2, 0))
#> [1] 4
which.min(c(1, 5, 3, 6, 2, 0))
#> [1] 6
unique(c(1, NA, 2, 3, 2, NA, 3))
#> [1]  1 NA  2  3
table(rep(1:4, 4:1))
#> 
#> 1 2 3 4 
#> 4 3 2 1
table(A = c(1, 1, 1, 2, 2), B = c(1, 2, 1, 2, 1))
#>    B
#> A   1 2
#>   1 2 1
#>   2 1 1
sample(10)
#>  [1] 10  2  1  9  3  6  5  8  7  4
sample(3:10, 5)
#> [1]  9  6 10  7  4
sample(3:10, 50, replace = TRUE)
#>  [1] 10  9 10  7  9  3  4  6  9  3  8  5  8  4 10 10  9  8  8  3  6  8  7  9  6  7  3  3
#> [29]  4  9  5  6  3  4  4  7  5  3 10 10  5  5  8  8  7  5  3 10 10  4
round(x <- runif(10, max = 100))
#>  [1] 86 79 41 61 86 91 81 44 17 95
round(x, digits = 2)
#>  [1] 86.13 79.26 41.10 60.97 86.07 91.39 81.11 43.85 17.21 95.41
round(x, -1)
#>  [1]  90  80  40  60  90  90  80  40  20 100
pmin(1:4, 4:1)
#> [1] 1 2 2 1
pmax(1:4, 4:1)
#> [1] 4 3 3 4
outer(1:4, 1:3, '+')
#>      [,1] [,2] [,3]
#> [1,]    2    3    4
#> [2,]    3    4    5
#> [3,]    4    5    6
#> [4,]    5    6    7
expand.grid(param1 = c(5, 50), param2 = c(1, 3, 10))
#>   param1 param2
#> 1      5      1
#> 2     50      1
#> 3      5      3
#> 4     50      3
#> 5      5     10
#> 6     50     10

Also see this nice Q/A on grouping functions and the *apply family and this book chapter about looping.

3.4.3 Character operations

paste("I", "am", "me")
#> [1] "I am me"
paste0("test", 0)
#> [1] "test0"
paste0("PC", 1:10)
#>  [1] "PC1"  "PC2"  "PC3"  "PC4"  "PC5"  "PC6"  "PC7"  "PC8"  "PC9"  "PC10"
me <- "Florian"
glue::glue("I am {me}")  ## not in base, but SO useful
#> I am Florian
(x <- list.files(pattern = "\\.Rmd$", full.names = TRUE))
#> [1] "./good-practices.Rmd" "./index.Rmd"          "./intro.Rmd"         
#> [4] "./packages.Rmd"       "./performance.Rmd"    "./rprog.Rmd"         
#> [7] "./shiny.Rmd"          "./tidyverse.Rmd"
sub("\\.Rmd$", ".pdf", x)
#> [1] "./good-practices.pdf" "./index.pdf"          "./intro.pdf"         
#> [4] "./packages.pdf"       "./performance.pdf"    "./rprog.pdf"         
#> [7] "./shiny.pdf"          "./tidyverse.pdf"
(y <- sample(letters[1:4], 10, replace = TRUE))
#>  [1] "a" "a" "a" "a" "c" "b" "a" "a" "a" "a"
match(y, letters[1:4])
#>  [1] 1 1 1 1 3 2 1 1 1 1
y %in% letters[1:2]
#>  [1]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
rle(rep(1:4, 4:1))
#> Run Length Encoding
#>   lengths: int [1:4] 4 3 2 1
#>   values : int [1:4] 1 2 3 4
split(1:12, rep(letters[1:3], 4))
#> $a
#> [1]  1  4  7 10
#> 
#> $b
#> [1]  2  5  8 11
#> 
#> $c
#> [1]  3  6  9 12
intersect(letters[1:4], letters[3:5])
#> [1] "c" "d"
union(letters[1:4], letters[3:5])
#> [1] "a" "b" "c" "d" "e"
setdiff(letters[1:4], letters[3:5])
#> [1] "a" "b"

3.4.4 Logical operators

tryCatch(TRUE | stop("will go there"), error = function(e) print(e))
#> <simpleError in doTryCatch(return(expr), name, parentenv, handler): will go there>
TRUE || stop("won't go there")  ## won't evaluate second condition if first one is TRUE
#> [1] TRUE
c(TRUE, FALSE, TRUE, TRUE) & c(FALSE, TRUE, TRUE, TRUE) 
#> [1] FALSE FALSE  TRUE  TRUE
c(TRUE, FALSE, TRUE, TRUE) && c(FALSE, TRUE, TRUE, TRUE)  ## no warning here (old R)!! This is an error since R 4.3
#> [1] FALSE
(x <- rnorm(10))
#>  [1]  0.36627318 -0.94538659 -0.37411320 -0.49420286 -0.46650356 -0.12747210  0.04273725
#>  [8] -0.07248727 -0.20516713  1.65278545
ifelse(x > 0, x, -x)
#>  [1] 0.36627318 0.94538659 0.37411320 0.49420286 0.46650356 0.12747210 0.04273725
#>  [8] 0.07248727 0.20516713 1.65278545

Be careful with ifelse() (learn more there): its result always has the same length as the test. For example:

ifelse(FALSE, 0, 1:5)
#> [1] 1
`if`(FALSE, 0, 1:5)
#> [1] 1 2 3 4 5

3.4.5 Exercises

  1. Use sample(), rep_len() and split() to make a function that randomly splits some indices into a list of K groups of indices. [Which are the special cases that you should consider?]

    advr38pkg::split_ind(1:40, 3)
    #> $`1`
    #>  [1]  1  6  9 18 19 20 24 25 28 31 32 35 36 38
    #> 
    #> $`2`
    #>  [1]  7 10 13 14 16 17 21 30 33 34 37 39 40
    #> 
    #> $`3`
    #>  [1]  2  3  4  5  8 11 12 15 22 23 26 27 29
  2. Use replicate() and sample() to get a 95% confidence interval for the mean of the following vector:

    set.seed(1)
    (x <- rnorm(10))
    #>  [1] -0.6264538  0.1836433 -0.8356286  1.5952808  0.3295078 -0.8204684  0.4874291
    #>  [8]  0.7383247  0.5757814 -0.3053884
    mean(x)
    #> [1] 0.1322028
    #>       2.5%      97.5% 
    #> -0.3150315  0.5994491
  3. Use match() and some accessor to add a column “my_val” to the data frame my_mtcars by putting the corresponding value of the column specified in “my_col”. [Can your solution be used for any number of column names?]

    my_mtcars <- mtcars[c("mpg", "hp")]
    my_mtcars$my_col <- sample(c("mpg", "hp"), size = nrow(my_mtcars), replace = TRUE)
    head(my_mtcars)
    #>                    mpg  hp my_col
    #> Mazda RX4         21.0 110    mpg
    #> Mazda RX4 Wag     21.0 110     hp
    #> Datsun 710        22.8  93     hp
    #> Hornet 4 Drive    21.4 110    mpg
    #> Hornet Sportabout 18.7 175     hp
    #> Valiant           18.1 105    mpg

    Expected result (head):

    #>                    mpg  hp my_col my_val
    #> Mazda RX4         21.0 110    mpg   21.0
    #> Mazda RX4 Wag     21.0 110     hp    110
    #> Datsun 710        22.8  93     hp     93
    #> Hornet 4 Drive    21.4 110    mpg   21.4
    #> Hornet Sportabout 18.7 175     hp    175
    #> Valiant           18.1 105    mpg   18.1
  4. In the following data frame (recall that it is also a list), for the first 3 columns, replace letters by corresponding numbers based on the code:

    df <- data.frame(
      id1 = c("a", "f", "a"),
      id2 = c("b", "e", "e"), 
      id3 = c("c", "d", "f"),
      inter = c(7.343, 2.454, 3.234),
      stringsAsFactors = FALSE
    )
    df
    #>   id1 id2 id3 inter
    #> 1   a   b   c 7.343
    #> 2   f   e   d 2.454
    #> 3   a   e   f 3.234
    (code <- setNames(1:6, letters[1:6]))
    #> a b c d e f 
    #> 1 2 3 4 5 6

    Expected result:

    #>   id1 id2 id3 inter
    #> 1   1   2   3 7.343
    #> 2   6   5   4 2.454
    #> 3   1   5   6 3.234

3.5 Environments and scoping

Lexical scoping determines where to look for values, not when to look for them. R looks for values when the function is run, not when it’s created. This means that the output of a function can be different depending on objects outside its environment:

# Demonstrates lexical scoping with three nested functions: prints the
# enclosing environment of g and of f (plus its content and its parent),
# then returns the value of f().
h <- function() {
  x <- 10  ## local to h; this is the x that f() ends up using
  g <- function() {
    f <- function() {
      x + 1  ## x is defined neither in f nor in g, so it is found in h
    }
    print(environment(f))  ## where f was created: the current call of g
    print(ls(environment(f)))  ## list the objects in the environment
    print(parent.env(environment(f)))  ## environment(g)
    f()
  }
  print(environment(g))  ## where g was created: the current call of h
  print(ls(environment(g)))
  print(parent.env(environment(g)))  ## where h itself was defined
  g()
}
x <- 100
h()
#> <environment: 0x62ef478>
#> [1] "g" "x"
#> <environment: R_GlobalEnv>
#> <environment: 0x67fc630>
#> [1] "f"
#> <environment: 0x62ef478>
#> [1] 11

Variable x is not defined inside f so R will look at the environment of f (where f was defined) and then at the parent environment, and so on. Here, the first x that is found has value 10.

Be aware that, for functions, package environments are searched last, so you can mask an existing function (such as c() below) without noticing.

c <- function(...) paste0(...)  ## masks base::c() in the current environment
c(1, 2, 3)
#> [1] "123"
base::c(1, 2, 3)  ## the package must be given explicitly to reach the original
#> [1] 1 2 3
rm(c)  ## remove the new function from the environment
c(1, 2, 3)
#> [1] 1 2 3

You can use the <<- operator to change the value of an object in an upper environment:

count1 <- 0
count2 <- 0
# Returns i + 1. As a side effect, increments the count2 defined outside the
# function; the increment of count1 only affects a local copy.
f <- function(i) {
  count1 <-  count1 + 1  ## creates a local count1, discarded when f returns
  count2 <<- count2 + 1  ## modifies the existing count2 in an enclosing environment
  i + 1
}
sapply(1:10, f)
#>  [1]  2  3  4  5  6  7  8  9 10 11
count1
#> [1] 0
count2
#> [1] 10

Finally, how does ... work? Basically, whatever is passed in ... is copied and pasted where ... is used:

# Collects every argument passed through `...` (names included) into a list.
f1 <- function(...) list(...)
f1(a = 2, b = 3)
#> $a
#> [1] 2
#> 
#> $b
#> [1] 3
list(a = 2, b = 3)
#> $a
#> [1] 2
#> 
#> $b
#> [1] 3

Learn more about functions and scoping rules of R with the R Programming for Data Science book.

3.6 Attributes and classes

Attributes are metadata associated with an object. You can get/set the list of attributes with attributes() or one particular attribute with attr().

attributes(iris)
#> $names
#> [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"     
#> 
#> $row.names
#>   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21
#>  [22]  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42
#>  [43]  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63
#>  [64]  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84
#>  [85]  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105
#> [106] 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
#> [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
#> [148] 148 149 150
#> 
#> $class
#> [1] "data.frame"
class(iris)
#> [1] "data.frame"
attr(iris, "row.names")
#>   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21
#>  [22]  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42
#>  [43]  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63
#>  [64]  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84
#>  [85]  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105
#> [106] 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
#> [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
#> [148] 148 149 150

You can use structure() to create an object and add some arbitrary attributes.

structure(1:10, my_fancy_attribute = "blabla")
#>  [1]  1  2  3  4  5  6  7  8  9 10
#> attr(,"my_fancy_attribute")
#> [1] "blabla"

There are also some attributes with specific accessor functions to get and set values. For example, use names(x), dim(x) and class(x) instead of attr(x, "names"), attr(x, "dim") and attr(x, "class").


class(mylm <- lm(Sepal.Length ~ ., data = iris))
#> [1] "lm"

I’ve just fitted a linear model in order to predict the sepal length variable of the iris dataset based on the other variables. Using lm() gets me an object of class lm. What are the methods I can use for this object?

methods(class = class(mylm))
#>  [1] add1           alias          anova          case.names     coerce        
#>  [6] confint        cooks.distance deviance       dfbeta         dfbetas       
#> [11] drop1          dummy.coef     effects        extractAIC     family        
#> [16] formula        hatvalues      influence      initialize     kappa         
#> [21] labels         logLik         model.frame    model.matrix   nobs          
#> [26] plot           predict        print          proj           qr            
#> [31] residuals      rstandard      rstudent       show           simulate      
#> [36] slotsFromS3    summary        variable.names vcov          
#> see '?methods' for accessing help and source code
summary(mylm)
#> 
#> Call:
#> lm(formula = Sepal.Length ~ ., data = iris)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.79424 -0.21874  0.00899  0.20255  0.73103 
#> 
#> Coefficients:
#>                   Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)        2.17127    0.27979   7.760 1.43e-12 ***
#> Sepal.Width        0.49589    0.08607   5.761 4.87e-08 ***
#> Petal.Length       0.82924    0.06853  12.101  < 2e-16 ***
#> Petal.Width       -0.31516    0.15120  -2.084  0.03889 *  
#> Speciesversicolor -0.72356    0.24017  -3.013  0.00306 ** 
#> Speciesvirginica  -1.02350    0.33373  -3.067  0.00258 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.3068 on 144 degrees of freedom
#> Multiple R-squared:  0.8673, Adjusted R-squared:  0.8627 
#> F-statistic: 188.3 on 5 and 144 DF,  p-value: < 2.2e-16
plot(mylm)


R has the easiest way to create a class and to use methods on objects of this class; it is called S3. If you want to know more about the other types of classes, see the Advanced R book.

agent007 <- list(first = "James", last = "Bond")
agent007
#> $first
#> [1] "James"
#> 
#> $last
#> [1] "Bond"
class(agent007) <- "Person"  ## "agent007" is now an object of class "Person"
# Just make a function called <method_name>.<class_name>()
# S3 print method for objects of class "Person".
#   x:   an object with elements `first` and `last`.
#   ...: accepted (and ignored) so that the signature is compatible with the
#        print(x, ...) generic; without it, print(obj, <any extra argument>)
#        fails with an "unused argument" error.
# Returns `x` invisibly, as print methods conventionally do.
print.Person <- function(x, ...) {
  print(glue::glue("My name is {x$last}, {x$first} {x$last}."))
  invisible(x)
}

agent007
#> My name is Bond, James Bond.
# Constructor of class as simple function
# Constructor: builds a list holding the two names and tags it with the
# S3 class "Person".
Person <- function(first, last) {
  obj <- list(first = first, last = last)
  class(obj) <- "Person"
  obj
}
(me <- Person("Florian", "Privé"))
#> My name is Privé, Florian Privé.

An object can have many classes:

# Constructor: starts from a Person, stores the job in it, and prepends
# "Worker" to the class vector so that Worker methods are tried first.
Worker <- function(first, last, job) {
  worker <- Person(first, last)
  worker[["job"]] <- job
  structure(worker, class = c("Worker", class(worker)))
}
# S3 print method for objects of class "Worker".
#   x:   an object with an element `job`, whose class vector has another
#        class after "Worker" that provides its own print method.
#   ...: accepted so that the signature is compatible with the
#        print(x, ...) generic.
# Returns `x` invisibly, as print methods conventionally do.
print.Worker <- function(x, ...) {
  NextMethod()  ## delegate to the print method of the next class in class(x)
  print(glue::glue("I am a {x$job}."))
  invisible(x)
}

(worker_007 <- Worker("James", "Bond", "secret agent"))
#> My name is Bond, James Bond.
#> I am a secret agent.
(worker_me <- Worker("Florian", "Privé", "PhD student"))
#> My name is Privé, Florian Privé.
#> I am a PhD student.