# Chapter 3 Working with an FBM

## 3.1 Similar accessor as R matrices

library(bigstatsr)
# Create a 2 x 5 FBM filled with 1:10 and backed by the file "test";
# $save() stores the FBM object so it can be re-attached later.
X <- FBM(2, 5, init = 1:10, backingfile = "test")$save()
X$backingfile                ## the file where the data is actually stored
#> [1] "C:\\Users\\au639593\\Desktop\\bigsnpr-extdoc\\test.bk"
X <- big_attach("test.rds")  ## can get the FBM from any R session
X[, 1]  ## ok: one column
#> [1] 1 2
X[1, ]  ## avoid on big data: one row
#> [1] 1 3 5 7 9
X[]     ## whole matrix in memory: only for small FBMs
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]    1    3    5    7    9
#> [2,]    2    4    6    8   10

You can access the whole FBM as an R matrix in memory using X[]. However, if the matrix is too large to fit in memory, you should always access only a subset of columns. Note that the elements of the FBM are stored column-wise (as for a standard R matrix). Therefore, be careful not to access a subset of rows, since doing so would read non-contiguous elements scattered across the whole matrix on disk.

## 3.2 Split-(par)Apply-Combine Strategy

#> [1]  3  7 11 15 19

This strategy shows how to apply standard R functions to big matrices (possibly in parallel); it is implemented in big_apply().

Compute the sum of each column of X <- big_attachExtdata() using big_apply().

## 3.3 Similar accessor as Rcpp matrices

// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::depends(bigstatsr, rmio)]]
#include <bigstatsr/BMAcc.h>

// [[Rcpp::export]]
NumericVector bigcolsums(Environment BM) {

  // The FBM environment carries its external pointer under "address".
  XPtr<FBM> fbm_ptr = BM["address"];
  // Accessor offering matrix-like (row, column) reads of the data as doubles.
  BMAcc<double> acc(fbm_ptr);

  // Dimensions are queried just as for an Rcpp::NumericMatrix.
  const size_t n_rows = acc.nrow();
  const size_t n_cols = acc.ncol();

  // One accumulator per column.
  NumericVector col_sums(n_cols);

  // Outer loop over columns, inner loop over the rows of that column.
  for (size_t col = 0; col < n_cols; ++col) {
    for (size_t row = 0; row < n_rows; ++row) {
      col_sums[col] += acc(row, col);
    }
  }

  return col_sums;
}

For a subset of the data:

// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::depends(bigstatsr, rmio)]]
#include <bigstatsr/BMAcc.h>

// [[Rcpp::export]]
// Column sums restricted to a subset of rows and columns of an FBM.
//   BM     - the FBM object (an environment holding the external pointer "address")
//   rowInd - indices of the rows to include
//   colInd - indices of the columns to include
// Returns one sum per entry of colInd.
NumericVector bigcolsums2(Environment BM,
                          const IntegerVector& rowInd,
                          const IntegerVector& colInd) {

  XPtr<FBM> xpBM = BM["address"];   // get the external pointer

  // accessor to a sub-view of the data -> the only line of code that should change
  // NOTE(review): the trailing 1 is presumably the offset subtracted from the
  // 1-based R indices to get 0-based positions -- confirm against SubBMAcc.
  SubBMAcc<double> macc(xpBM, rowInd, colInd, 1);

  size_t n = macc.nrow();
  size_t m = macc.ncol();

  NumericVector res(m);

  for (size_t j = 0; j < m; j++) {
    for (size_t i = 0; i < n; i++) {
      res[j] += macc(i, j);
    }
  }

  return res;
}

## 3.4 Some summary functions are already implemented

# Per-column summaries of X: the output has one row per column of the FBM.
big_colstats(X)  # sum and var (for each column)
#>   sum var
#> 1   3 0.5
#> 2   7 0.5
#> 3  11 0.5
#> 4  15 0.5
#> 5  19 0.5
# big_scale() returns a function; calling that function on X gives the
# centering (mean) and scaling (sd) value of each column.
big_scale()(X)   # mean and sd (for each column)
#>   center     scale
#> 1    1.5 0.7071068
#> 2    3.5 0.7071068
#> 3    5.5 0.7071068
#> 4    7.5 0.7071068
#> 5    9.5 0.7071068

To only use a subset of the data stored as an FBM, you should almost never make a copy of the data; instead, use parameters ind.row (or ind.train) and ind.col to apply functions to a subset of the data.