BAM files

Standard BAM file
Standard BAM file

The BAM file was exported as a TSV file, but importing it into R raises parsing warnings:

# Import the tab-delimited export without a header row; columns get the
# default names X1, X2, ...
readr::read_delim(
  file = "banalyzer/data_raw/data/cellranger_big.txt",
  delim = "\t",
  col_names = FALSE,
  escape_double = FALSE,
  trim_ws = TRUE
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 3698 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): X1, X3, X6, X7, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, ...
## dbl  (5): X2, X4, X5, X8, X9
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Bam converter 10x function

#' 10x Bam converter
#'
#' Takes a tsv file exported from a cellranger bam file as input and returns a
#' data frame. The first 11 columns of every read are treated as the fixed BAM
#' fields; every further column is expected to hold one optional field whose
#' first two characters are its tag (e.g. `CB:Z:...`). Because reads carry
#' different sets of optional fields, the same tag can sit in different columns
#' from one row to the next. The converter realigns the fields so that each tag
#' gets its own column, named after the two-letter tag, with the `TAG:TYPE:`
#' prefix stripped from the values.
#'
#' @param text.file File address of a cellranger bam file transformed in tsv
#'   format (tab delimited, without a header row).
#'
#' @return A data frame with single reads as rows and bam file parameters as
#'   columns: the 11 fixed fields first, followed by one column per optional
#'   tag found in the file. Tags that a read does not carry, and cells that
#'   contain only `"-"`, are `NA`.
#' @export
#' @import dplyr
#'
#' @examples bam.converter.10x(system.file("extdata/cellrangerdata.txt", package = "banalyzer"))
bam.converter.10x <- function(text.file){
  
  # Common names are the same for all BAM files (the first 11 columns are fixed)
  common.names <- c("QName", "FLAG", "Ref_name", "Left_pos", 
                    "Map_Quality", "CIGAR", "Ref_name_mate", 
                    "Pos_name_read", "Temp_length", "Sequence", "Read_Quality")
  n.common <- length(common.names)
  
  # Read bam file as tab delimited file in. Suppresses warnings and messages:
  # rows with differing numbers of optional fields trigger parsing warnings.
  raw.table <- suppressMessages(suppressWarnings(
    readr::read_delim(text.file,
                      delim = "\t", escape_double = FALSE,
                      col_names = FALSE, trim_ws = TRUE)))
  
  if (ncol(raw.table) < n.common) {
    stop("Expected at least ", n.common, " tab-separated columns but found ",
         ncol(raw.table), ".", call. = FALSE)
  }
  
  # No optional fields at all: only the fixed columns need naming.
  if (ncol(raw.table) == n.common) {
    colnames(raw.table) <- common.names
    raw.table[raw.table == "-"] <- NA
    return(raw.table)
  }
  
  # Positions of the optional-field columns (everything after the fixed ones)
  tag.columns <- seq(n.common + 1, ncol(raw.table))
  
  # Collect which combination of optional fields (pattern) is available for
  # each read (= each row): take the two-letter code of every optional cell and
  # paste the codes of a row together, in column order. Empty cells contribute
  # the literal "NA". Working column by column (instead of apply() on the
  # table) keeps this correct when the file holds a single read.
  tag.codes <- lapply(raw.table[tag.columns],
                      function(x) gsub("(^..)(.+$)", "\\1", x))
  raw.table[["pattern"]] <- do.call(paste, c(unname(tag.codes), sep = " "))
  
  # All tags that occur anywhere in the table, in order of first appearance,
  # without the "NA" placeholder of empty cells (exact match, not substring)
  full_pattern <- unique(unlist(strsplit(unique(raw.table$pattern), " ")))
  full_pattern <- full_pattern[full_pattern != "NA"]
  
  # The column names at the end will be the union of common variables and the
  # full pattern of variables
  full_names <- c(common.names, full_pattern)
  
  # Split the raw table into a list of tables that share the same pattern of
  # values, i.e., have their optional fields in the same columns
  pattern.divided.table <- raw.table %>%
    group_by(pattern) %>% 
    group_split()
  
  # Remove the prefix (tag and type) from the cells: everything up to and
  # including the second ":"
  prefix.remover <- function(complex.string){
    sub("^..:.:", "", complex.string)
  }
  
  # Bring one pattern group into the common layout: name its optional columns
  # after their tags, drop the empty ones, strip the prefixes and add the tags
  # this group lacks as NA columns.
  single.table.adaptor <- function(df){
    
    # The column order is unchanged since the pattern was created - thus the
    # order in the pattern is the same as the order of the optional columns
    specific.names <- unlist(strsplit(df$pattern[1], " "))
    present <- specific.names != "NA"
    tag.names <- specific.names[present]
    
    # Keep the fixed columns, the filled optional columns and the trailing
    # "pattern" column. The pattern column is named explicitly so that the
    # number of names matches the number of columns; it also keeps rows of
    # different groups distinct during the merge below.
    df <- df[c(rep(TRUE, n.common), present, TRUE)]
    colnames(df) <- c(common.names, tag.names, "pattern")
    
    # Strip the prefixes; lapply() works for any number of rows
    if (length(tag.names) > 0) {
      df[tag.names] <- lapply(df[tag.names], prefix.remover)
    }
    
    # Add the tags that exist in the full pattern but not in this group.
    # Character NA keeps the column type identical across groups.
    missing <- setdiff(full_names, colnames(df))
    if (length(missing) > 0) {
      df[missing] <- NA_character_
    }
    
    df
    
  }
  
  pattern.divided.table <- lapply(pattern.divided.table, single.table.adaptor)
  
  # Merge the pattern divided tables into one table
  output <- Reduce(function(...) merge(..., all = TRUE), pattern.divided.table)
  
  ## Reorder columns to standard sequence (this also drops the pattern column)
  output <- output[, full_names]
  
  ## Replace eventual "-" with a real missing value, consistent with the NA
  ## used for absent tags (not the string "NA")
  output[output == "-"] <- NA
  
  ## Return output table
  output
  
}

# Run the converter on the tsv export imported above
bam.converter.10x(
  text.file = "banalyzer/data_raw/data/cellranger_big.txt"
)