In this vignette, you will learn how to read a text file into a Filebacked Big Matrix (FBM). The {bigreadr} package is required.

Data

library(bigstatsr)
library(bigreadr)

## LONG CSV
df <- datasets::mtcars
csv <- fwrite2(df[rep(seq_len(nrow(df)), 500000), ], 
               tempfile(fileext = ".csv"), 
               row.names = TRUE)
format(file.size(csv), big.mark = ",")
## [1] "932,944,462"

Check file content

nlines(csv)
## [1] 1.6e+07
(first_rows <- fread2(csv, nrows = 5))
##                  V1  mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1         Mazda RX4 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2     Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3        Datsun 710 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4    Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
sapply(first_rows, typeof)
##          V1         mpg         cyl        disp          hp        drat 
## "character"    "double"   "integer"   "integer"   "integer"    "double" 
##          wt        qsec          vs          am        gear        carb 
##    "double"    "double"   "integer"   "integer"   "integer"   "integer"
ncol(first_rows)
## [1] 12

What you can see with these first lines:

  • there are some column names (header),
  • each line has 12 elements,
  • the first element of each line stores some row names,
  • and the other elements seem to be all numeric.

Read those data

  • Read all numeric columns into an FBM

    (test <- big_read(csv, select = 2:12))
    ## A Filebacked Big Matrix of type 'double' with 16000000 rows and 11 columns.
    rbind(csv, test$backingfile)
    ##     [,1]                                  
    ## csv "/tmp/Rtmp4o5mzH/file528e7b0c0b14.csv"
    ##     "/tmp/Rtmp4o5mzH/file528e7b0c0b14.bk"
    attr(test, "fbm_names")
    ##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
    ## [11] "carb"
  • Read other non-numeric data afterwards

    meta <- fread2(csv, select = 1)
    head(meta)
    ##                  V1
    ## 1         Mazda RX4
    ## 2     Mazda RX4 Wag
    ## 3        Datsun 710
    ## 4    Hornet 4 Drive
    ## 5 Hornet Sportabout
    ## 6           Valiant

Read filtered data

## A Filebacked Big Matrix of type 'double' with 5500000 rows and 11 columns.