BAM files

Standard BAM file
Standard BAM file

Exported as tsv, but when imported in R

readr:: read_delim( "banalyzer/data_raw/data/cellranger_big.txt",
            delim = "\t", escape_double = FALSE,
            col_names = FALSE, trim_ws = TRUE)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 3698 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): X1, X3, X6, X7, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, ...
## dbl  (5): X2, X4, X5, X8, X9
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Bam converter 10x function

#' 10x Bam converter
#' takes a tsv file from a cellranger bam file as input and returns a data frame.
#'
#' @param text.file File address of a cellranger bam file transformed in tsv format.
#'
#' @return A data frame with single reads as rows and bam file parameters as columns
#' @export
#' @import dplyr
#'
#' @examples bam.converter.10x(system.file("extdata/cellrangerdata.txt", package = "banalyzer"))

bam.converter.10x <- function(text.file){
  
  # Read bam file as tab delimited file in. Suppresses warnings and messages.
  
  raw.table <- suppressMessages(suppressWarnings(
               readr:: read_delim(text.file,
                                  delim = "\t", escape_double = FALSE,
                                  col_names = FALSE, trim_ws = TRUE)))
  
  # Collect information about which combinations of informations (pattern) is
  # available for each read (= each row). It achieves it by collecting the 2
  # letter code for each informaiton in a new "pattern" column. This is true
  # for all the column after the number 11, as the first 11 columns are always
  # the same
  
  raw.table["pattern"] <- 
    # Find which information each observation contains and in which order
    apply(raw.table[12:ncol(raw.table)], 2, 
          function(x) gsub("(^..)(.+$)", "\\1", x)) %>%
    # Collapse the pattern of information and append it to each read as a new
    # variable
    apply(1, function(x) paste(x, collapse = " "))
  
  # Extract the possible pattern combinations
  pattern_list <- unique(raw.table$pattern)
  
  # Create a list of all the existing information (abbreviation) in the table,
  # without NAs
  full_pattern <- 
    # Collapse the pattern list to a single string
    paste(pattern_list, collapse = " ") %>%
    # Split the single string to a list of n elements
    strsplit(" ") %>%
    # Transform the list to a vector
    unlist() %>% 
    # Keep only one cpy of each abbreviation
    unique()
  
  full_pattern <- unique(grep("NA", full_pattern, value = TRUE, invert = TRUE))
  
  # Define column names of final table
  
  # Common names are the same for all BAM files ( the first 11 columns are fixed)
  common.names <- c("QName", "FLAG", "Ref_name", "Left_pos", 
                    "Map_Quality", "CIGAR", "Ref_name_mate", 
                    "Pos_name_read", "Temp_length", "Sequence", "Read_Quality")
  # The column names at the end will be the union of common variables and the full pattern of variables
  full_names <- c(common.names, full_pattern)
  
  # Split the raw table into a list of tables that share the same patern of values
  # i.e., will have the same number of columns with non-NA values
  pattern.divided.table <- raw.table %>%
    group_by(pattern) %>% 
    group_split()
  
  # Create a function that:
  # 1. Identifies which variables are present in each pattern group
  single.table.adaptor <- function(df){
    
    # Assign column names for this table - the column order is unchanged since 
    # the pattern was created - thus the order in the pattern is the same as the
    # order of the variables contained in the columns
    specific.names <- strsplit(df$pattern[1], " ") %>% unlist()
    tab.names <- c(common.names, specific.names)
    colnames(df) <- tab.names
    
    # Remove columns named NA
    df <- df[!colnames(df) %in% ("NA")]
    
    # Substitute with the real column names that are in the full pattern, but are
    # not in the subtable yet
    missing <- setdiff(full_names, colnames(df))
    df[missing] <- NA
    
    # Remove the prefix (the pattern that asigned the information) from the cells
    prefix.remover <- function(complex.string){
      
      # Remove everything before the second ":"
      sub("^..:.:", "", complex.string)
      
    }
    
    # Apply the prefix remover to columns 12 to end (the ones with prefix).
    # However, the function's result is weird, if only one row is available,
    # therefore the two different conditions
    if(nrow(df) > 1){
      
      df[12:ncol(df)] <-  apply(df[12:ncol(df)], 2, prefix.remover)
      
    } else {
      
      df[1,12:ncol(df)] <-  lapply(df[1,12:ncol(df)], prefix.remover)
      
    }
    
    return(df)
    
  }
  
  pattern.divided.table <- lapply(pattern.divided.table, single.table.adaptor)
  
  
  
  # Merge the pattern divided tables into one table
  output <- Reduce(function(...) merge(...,all=TRUE), pattern.divided.table)
  
  ## Reorder columns to standard sequence
  col_order <- c(common.names, full_pattern)
  output <- output[,col_order]
  
  ## Replace eventual "-" with NA
  output[output == "-"] <- "NA"
  
  ## Print output table
  return(output)
  
}

bam.converter.10x("banalyzer/data_raw/data/cellranger_big.txt")

Different BAM formats, according to pipeline used

https://gsfrattari.github.io/banalyzer/index.html

Build an example data set

######################################################################
## Title        : Example data set preparation 
##                Takes a raw tsv file, changes sensible data in it, transforms
##                it back to tsv file and saves it in inst/extdata
##
## Author       : Giacomo Schmidt Frattari
##
## Version      : 0.0.1
##
## Date         : 25-08-2023
######################################################################
##
## Prerequisites:#####################################################
##
## Loading required libraries:########################################
library(readr)
library(dplyr)

original.cellranger <- suppressMessages(suppressWarnings(
  readr:: read_delim("banalyzer/inst/data_raw/datacellranger_big.txt",
                     delim = "\t", escape_double = FALSE,
                     col_names = FALSE, trim_ws = TRUE)))

colnames(original.cellranger) <- paste0("col_", c(1:ncol(original.cellranger)))

# Changing sensible data
# Gene alignment
original.cellranger$col_3 <- sample(rep(c("gene_a", "gene_b", "gene_c"), 50000), nrow(original.cellranger))

# Change column 7 to * and 8 and 9 to 0 to be sure
original.cellranger$col_7 <- "*"
original.cellranger$col_8 <- 0
original.cellranger$col_9 <- 0

# Generate a rondom sequence for column 10
nucl <- c("A","G","T","C")
original.cellranger$col_10 <- replicate(nrow(original.cellranger), 
                                        paste0(sample(nucl, 90, replace = TRUE), 
                                               collapse=""))

# Change column 16
original.cellranger$col_16 <- gsub("_bh_.._...:", 
                                   "_bh_test_test:", 
                                   original.cellranger$col_16)

original.cellranger$col_17 <- gsub("(TX:Z:[[:alpha:]]+),", 
                                   paste0("TX:Z:a_gene,"), 
                                   original.cellranger$col_17)

original.cellranger$col_18 <- gsub("(GX:Z:[[:alpha:]]+)", 
                                   paste0("GX:Z:a_gene"), 
                                   original.cellranger$col_18)

original.cellranger$col_18 <- gsub("(AN:Z:[[:alpha:]]+)", 
                                   paste0("AN:Z:a_gene"), 
                                   original.cellranger$col_18)

original.cellranger$col_19 <- gsub("(GN:Z:[[:alpha:]]+)", 
                                   paste0("GN:Z:a_gene"), 
                                   original.cellranger$col_19)

original.cellranger$col_20 <- gsub("(fx:Z:[[:alpha:]]+)", 
                                   paste0("fx:Z:a_gene"), 
                                   original.cellranger$col_20)



write_tsv(original.cellranger,
          file = "inst/extdata/cellrangerdata.txt",
          na = "",
          col_names = F)

Run example

bam.converter.10x("banalyzer/inst/extdata/cellrangerdata.txt")

Visualize alignment with splice site

#' Alignment plotter - splice sites
#' 
#' Plots alignments with gaps to visualize splice sites
#'
#' @param rbam.dataframe A data frame generated from a tsv bam file with `bam.converter`
#'
#' @return A ggplot with visualization of the CIGAR string
#' @export
#' @import ggplot2
#'
#' @examples alignment.plotter.splice(bam.converter.10x(system.file("extdata/cellrangerdata.txt", package = "banalyzer")))

alignment.plotter.splice <- function(rbam.dataframe){
  
  splice.tab <- rbam.dataframe %>% select(QName, UB, CIGAR, Left_pos)
  
  #Identify reads with gaps, and if so, which reads
  splice.tab <- splice.tab %>% 
    mutate (gap.length= ifelse(grepl("N", CIGAR) == TRUE, gsub("(.+M)([[:digit:]]+)(N.+)", "\\2", CIGAR ), 0))
  splice.tab$gap.length <- as.numeric(splice.tab$gap.length)
  
  
  splice.tab <- splice.tab %>% 
    mutate (first.match = (stringr::str_extract(CIGAR,"[[:digit:]]+M")))
  splice.tab <- splice.tab %>% mutate (l.first.match= sub("M", "", first.match))
  splice.tab <- splice.tab %>% mutate (right_pos = Left_pos + as.numeric(l.first.match))
  
  donor.tab <- splice.tab %>% filter(grepl("N", CIGAR) == T)
  
  #Add column to define plot order based on length
  donor.tab <- donor.tab %>% mutate(plot.pos = paste(l.first.match, QName, sep= "_"))
  plot.order <-  donor.tab %>% arrange(plot.pos) %>% pull(plot.pos)
  
  donor.tab %>%
    tidyr::pivot_longer(cols = c(Left_pos, right_pos), names_to = "place", values_to = "nt.pos") %>%
    ggplot(aes(x= nt.pos, y= factor(plot.pos, levels=plot.order), group=plot.pos)) +
    geom_point() +
    geom_line() +
    theme_classic() +
    theme(axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          axis.line.y = element_blank())
}

bam.converter.10x("banalyzer/inst/extdata/cellrangerdata.txt") %>% 
  alignment.plotter.splice()

Next version: zoomer function

bam.converter.10x("banalyzer/inst/extdata/cellrangerdata.txt") %>% 
  alignment.plotter.splice() + coord_cartesian(c(0,500))