- for 5 of you:
- Data visualization
- for 4 of you:
- Fast code (via C++)
- for 3 of you:
- Use of out-of-RAM matrices (Big Data)
- Interactive plots
- Interactive apps (Shiny)
- Some features of RStudio (THE IDE of R)
2018-05-25
We will try to see a bit of everything.
Contents:
R packages that he has developped (from his website):
p_load(nycflights13) dim(flights) [1] 336776 19 head(flights) # A tibble: 6 × 19 year month day dep_time sched_dep_time dep_delay arr_time <int> <int> <int> <int> <int> <dbl> <int> 1 2013 1 1 517 515 2 830 2 2013 1 1 533 529 4 850 3 2013 1 1 542 540 2 923 4 2013 1 1 544 545 -1 1004 5 2013 1 1 554 600 -6 812 6 2013 1 1 554 558 -4 740 # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>, # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, # time_hour <dttm>
p_load(dplyr)
Dplyr aims to provide a function for each basic verb of data manipulation:
filter()
(and slice()
)arrange()
select()
(and rename()
)distinct()
mutate()
(and transmute()
)summarise()
sample_n()
(and sample_frac()
)filter(flights, month == 1, day == 1)
# A tibble: 842 × 19 year month day dep_time sched_dep_time dep_delay arr_time <int> <int> <int> <int> <int> <dbl> <int> 1 2013 1 1 517 515 2 830 2 2013 1 1 533 529 4 850 3 2013 1 1 542 540 2 923 4 2013 1 1 544 545 -1 1004 5 2013 1 1 554 600 -6 812 6 2013 1 1 554 558 -4 740 7 2013 1 1 555 600 -5 913 8 2013 1 1 557 600 -3 709 9 2013 1 1 557 600 -3 838 10 2013 1 1 558 600 -2 753 # ... with 832 more rows, and 12 more variables: sched_arr_time <int>, # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, # minute <dbl>, time_hour <dttm>
arrange(flights, desc(dep_delay))
# A tibble: 336,776 × 19 year month day dep_time sched_dep_time dep_delay arr_time <int> <int> <int> <int> <int> <dbl> <int> 1 2013 1 9 641 900 1301 1242 2 2013 6 15 1432 1935 1137 1607 3 2013 1 10 1121 1635 1126 1239 4 2013 9 20 1139 1845 1014 1457 5 2013 7 22 845 1600 1005 1044 6 2013 4 10 1100 1900 960 1342 7 2013 3 17 2321 810 911 135 8 2013 6 27 959 1900 899 1236 9 2013 7 22 2257 759 898 121 10 2013 12 5 756 1700 896 1058 # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>, # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, # minute <dbl>, time_hour <dttm>
mutate(flights, gain = arr_delay - dep_delay, speed = distance / air_time * 60)
# A tibble: 336,776 × 21 year month day dep_time sched_dep_time dep_delay arr_time <int> <int> <int> <int> <int> <dbl> <int> 1 2013 1 1 517 515 2 830 2 2013 1 1 533 529 4 850 3 2013 1 1 542 540 2 923 4 2013 1 1 544 545 -1 1004 5 2013 1 1 554 600 -6 812 6 2013 1 1 554 558 -4 740 7 2013 1 1 555 600 -5 913 8 2013 1 1 557 600 -3 709 9 2013 1 1 557 600 -3 838 10 2013 1 1 558 600 -2 753 # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>, # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, # minute <dbl>, time_hour <dttm>, gain <dbl>, speed <dbl>
flights2 <- flights %>% filter(month == 1, day == 1) %>% arrange(desc(dep_delay)) %>% mutate(gain = arr_delay - dep_delay, speed = distance / air_time * 60) print(flights2, n = 6)
# A tibble: 842 × 21 year month day dep_time sched_dep_time dep_delay arr_time <int> <int> <int> <int> <int> <dbl> <int> 1 2013 1 1 848 1835 853 1001 2 2013 1 1 2343 1724 379 314 3 2013 1 1 1815 1325 290 2120 4 2013 1 1 2205 1720 285 46 5 2013 1 1 1842 1422 260 1958 6 2013 1 1 2115 1700 255 2330 # ... with 836 more rows, and 14 more variables: sched_arr_time <int>, # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, # minute <dbl>, time_hour <dttm>, gain <dbl>, speed <dbl>
p_load(ggplot2) p <- qplot(dep_delay, arr_delay, data = flights2, main = "Flights which take off late arrive late. Surprising!") print(p)
p + geom_smooth()
citation("ggplot2")
To cite ggplot2 in publications, please use: H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2009. A BibTeX entry for LaTeX users is @Book{, author = {Hadley Wickham}, title = {ggplot2: Elegant Graphics for Data Analysis}, publisher = {Springer-Verlag New York}, year = {2009}, isbn = {978-0-387-98140-6}, url = {http://ggplot2.org}, }
p_load(ggExtra) ggMarginal(p, type = "histogram")
p_load(plotly) ggplotly(p)
Looking for inspiration or help concerning data visualisation with R? Go check the R graph gallery!
Live demo!
shiny::runGitHub("privefl/repartitions_equipes")
More advanced usage: Advanced tips and tricks
More infos there
Typical bottlenecks that C++ can address include:
sumR <- function(x) { total <- 0 for (i in seq_along(x)) { total <- total + x[i] } total }
In Rcpp:
#include <Rcpp.h> using namespace Rcpp; // [[Rcpp::export]] double sumC(NumericVector x) { int n = x.size(); double total = 0; for(int i = 0; i < n; ++i) { total += x[i]; } return total; }
In Rcpp Sugar:
#include <Rcpp.h> using namespace Rcpp; // [[Rcpp::export]] double sumCS(NumericVector x) { return sum(x); }
Microbenchmark:
p_load(microbenchmark) x <- runif(1e3) microbenchmark( sum(x), sumC(x), sumCS(x), sumR(x) )
Unit: microseconds expr min lq mean median uq max neval sum(x) 1.624 1.9160 2.25361 2.0205 2.1310 6.616 100 sumC(x) 3.501 4.0135 5.45615 4.3500 5.2285 27.545 100 sumCS(x) 3.567 4.0395 4.97736 4.4040 5.1380 16.425 100 sumR(x) 483.268 504.4415 586.03832 548.2945 616.3200 1926.567 100
gibbs_r <- function(N, thin) { mat <- matrix(nrow = 2, ncol = N) x <- y <- 0 for (i in 1:N) { for (j in 1:thin) { x <- rgamma(1, 3, y * y + 4) y <- rnorm(1, 1 / (x + 1), 1 / sqrt(2 * (x + 1))) } mat[, i] <- c(x, y) } mat }
#include <Rcpp.h> using namespace Rcpp; // [[Rcpp::export]] NumericMatrix gibbs_cpp(int N, int thin) { NumericMatrix mat(2, N); double x = 0, y = 0; for(int i = 0; i < N; i++) { for(int j = 0; j < thin; j++) { x = rgamma(1, 3, 1 / (y * y + 4))[0]; y = rnorm(1, 1 / (x + 1), 1 / sqrt(2 * (x + 1)))[0]; } mat(0, i) = x; mat(1, i) = y; } return(mat); }
microbenchmark( gibbs_r(100, 10), gibbs_cpp(100, 10) )
Unit: microseconds expr min lq mean median uq gibbs_r(100, 10) 5961.415 6185.3970 7178.3434 6526.775 7090.976 gibbs_cpp(100, 10) 257.184 277.0105 302.6162 287.633 301.857 max neval 45224.159 100 1470.401 100
char
, short
, int
, float
, double
[i, j]
as a matrix[j][i]
DO_all <- function(X, infos, ncores) { DO_chr <- function(X.desc, lims) { X.chr <- sub.big.matrix(X.desc, firstCol = lims[1], lastCol = lims[2], backingpath = "backingfiles") ## Do something with X.chr (such as imputing) } range.chr <- LimsChr(infos) X.desc <- describe(X) obj <- foreach(chr = 1:nrow(range.chr), .packages = "bigmemory") expr_fun <- function(chr) { DO_chr(X.desc, range.chr[chr, ]) } res <- foreach2(obj, expr_fun, ncores) }
LimsChr <- function(infos) { map.rle <- rle(infos$map$chromosome) upper <- cumsum(map.rle$length) lower <- c(1, upper[-length(upper)] + 1) cbind(lower, upper, "chr" = map.rle$values) } foreach2 <- function(obj, expr_fun, ncores) { if (is.seq <- (ncores == 1)) { foreach::registerDoSEQ() } else { cl <- parallel::makeCluster(ncores) doParallel::registerDoParallel(cl) } res <- eval(parse( text = sprintf("foreach::`%%dopar%%`(obj, expr_fun(%s))", obj$argnames))) if (!is.seq) parallel::stopCluster(cl) return(res) }
Live demo!
More tips: RStudio Tips on Twitter
Learn: R Course Finder