BAM files
Standard BAM file
Exported as tsv, but when imported in R
readr:: read_delim( "banalyzer/data_raw/data/cellranger_big.txt",
delim = "\t", escape_double = FALSE,
col_names = FALSE, trim_ws = TRUE)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 3698 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): X1, X3, X6, X7, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, ...
## dbl (5): X2, X4, X5, X8, X9
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Bam converter 10x function
#' 10x Bam converter
#' takes a tsv file from a cellranger bam file as input and returns a data frame.
#'
#' @param text.file File address of a cellranger bam file transformed in tsv format.
#'
#' @return A data frame with single reads as rows and bam file parameters as columns
#' @export
#' @import dplyr
#'
#' @examples bam.converter.10x(system.file("extdata/cellrangerdata.txt", package = "banalyzer"))
bam.converter.10x <- function(text.file){
# Read bam file as tab delimited file in. Suppresses warnings and messages.
raw.table <- suppressMessages(suppressWarnings(
readr:: read_delim(text.file,
delim = "\t", escape_double = FALSE,
col_names = FALSE, trim_ws = TRUE)))
# Collect information about which combinations of informations (pattern) is
# available for each read (= each row). It achieves it by collecting the 2
# letter code for each informaiton in a new "pattern" column. This is true
# for all the column after the number 11, as the first 11 columns are always
# the same
raw.table["pattern"] <-
# Find which information each observation contains and in which order
apply(raw.table[12:ncol(raw.table)], 2,
function(x) gsub("(^..)(.+$)", "\\1", x)) %>%
# Collapse the pattern of information and append it to each read as a new
# variable
apply(1, function(x) paste(x, collapse = " "))
# Extract the possible pattern combinations
pattern_list <- unique(raw.table$pattern)
# Create a list of all the existing information (abbreviation) in the table,
# without NAs
full_pattern <-
# Collapse the pattern list to a single string
paste(pattern_list, collapse = " ") %>%
# Split the single string to a list of n elements
strsplit(" ") %>%
# Transform the list to a vector
unlist() %>%
# Keep only one cpy of each abbreviation
unique()
full_pattern <- unique(grep("NA", full_pattern, value = TRUE, invert = TRUE))
# Define column names of final table
# Common names are the same for all BAM files ( the first 11 columns are fixed)
common.names <- c("QName", "FLAG", "Ref_name", "Left_pos",
"Map_Quality", "CIGAR", "Ref_name_mate",
"Pos_name_read", "Temp_length", "Sequence", "Read_Quality")
# The column names at the end will be the union of common variables and the full pattern of variables
full_names <- c(common.names, full_pattern)
# Split the raw table into a list of tables that share the same patern of values
# i.e., will have the same number of columns with non-NA values
pattern.divided.table <- raw.table %>%
group_by(pattern) %>%
group_split()
# Create a function that:
# 1. Identifies which variables are present in each pattern group
single.table.adaptor <- function(df){
# Assign column names for this table - the column order is unchanged since
# the pattern was created - thus the order in the pattern is the same as the
# order of the variables contained in the columns
specific.names <- strsplit(df$pattern[1], " ") %>% unlist()
tab.names <- c(common.names, specific.names)
colnames(df) <- tab.names
# Remove columns named NA
df <- df[!colnames(df) %in% ("NA")]
# Substitute with the real column names that are in the full pattern, but are
# not in the subtable yet
missing <- setdiff(full_names, colnames(df))
df[missing] <- NA
# Remove the prefix (the pattern that asigned the information) from the cells
prefix.remover <- function(complex.string){
# Remove everything before the second ":"
sub("^..:.:", "", complex.string)
}
# Apply the prefix remover to columns 12 to end (the ones with prefix).
# However, the function's result is weird, if only one row is available,
# therefore the two different conditions
if(nrow(df) > 1){
df[12:ncol(df)] <- apply(df[12:ncol(df)], 2, prefix.remover)
} else {
df[1,12:ncol(df)] <- lapply(df[1,12:ncol(df)], prefix.remover)
}
return(df)
}
pattern.divided.table <- lapply(pattern.divided.table, single.table.adaptor)
# Merge the pattern divided tables into one table
output <- Reduce(function(...) merge(...,all=TRUE), pattern.divided.table)
## Reorder columns to standard sequence
col_order <- c(common.names, full_pattern)
output <- output[,col_order]
## Replace eventual "-" with NA
output[output == "-"] <- "NA"
## Print output table
return(output)
}
bam.converter.10x("banalyzer/data_raw/data/cellranger_big.txt")
Build an example data set
######################################################################
## Title : Example data set preparation
## Takes a raw tsv file, changes sensible data in it, transforms
## it back to tsv file and saves it in inst/extdata
##
## Author : Giacomo Schmidt Frattari
##
## Version : 0.0.1
##
## Date : 25-08-2023
######################################################################
##
## Prerequisites:#####################################################
##
## Loading required libraries:########################################
library(readr)
library(dplyr)
original.cellranger <- suppressMessages(suppressWarnings(
readr:: read_delim("banalyzer/inst/data_raw/datacellranger_big.txt",
delim = "\t", escape_double = FALSE,
col_names = FALSE, trim_ws = TRUE)))
colnames(original.cellranger) <- paste0("col_", c(1:ncol(original.cellranger)))
# Changing sensible data
# Gene alignment
original.cellranger$col_3 <- sample(rep(c("gene_a", "gene_b", "gene_c"), 50000), nrow(original.cellranger))
# Change column 7 to * and 8 and 9 to 0 to be sure
original.cellranger$col_7 <- "*"
original.cellranger$col_8 <- 0
original.cellranger$col_9 <- 0
# Generate a rondom sequence for column 10
nucl <- c("A","G","T","C")
original.cellranger$col_10 <- replicate(nrow(original.cellranger),
paste0(sample(nucl, 90, replace = TRUE),
collapse=""))
# Change column 16
original.cellranger$col_16 <- gsub("_bh_.._...:",
"_bh_test_test:",
original.cellranger$col_16)
original.cellranger$col_17 <- gsub("(TX:Z:[[:alpha:]]+),",
paste0("TX:Z:a_gene,"),
original.cellranger$col_17)
original.cellranger$col_18 <- gsub("(GX:Z:[[:alpha:]]+)",
paste0("GX:Z:a_gene"),
original.cellranger$col_18)
original.cellranger$col_18 <- gsub("(AN:Z:[[:alpha:]]+)",
paste0("AN:Z:a_gene"),
original.cellranger$col_18)
original.cellranger$col_19 <- gsub("(GN:Z:[[:alpha:]]+)",
paste0("GN:Z:a_gene"),
original.cellranger$col_19)
original.cellranger$col_20 <- gsub("(fx:Z:[[:alpha:]]+)",
paste0("fx:Z:a_gene"),
original.cellranger$col_20)
write_tsv(original.cellranger,
file = "inst/extdata/cellrangerdata.txt",
na = "",
col_names = F)
Run example
bam.converter.10x("banalyzer/inst/extdata/cellrangerdata.txt")
Visualize alignment with splice site
#' Alignment plotter - splice sites
#'
#' Plots alignments with gaps to visualize splice sites
#'
#' @param rbam.dataframe A data frame generated from a tsv bam file with `bam.converter`
#'
#' @return A ggplot with visualization of the CIGAR string
#' @export
#' @import ggplot2
#'
#' @examples alignment.plotter.splice(bam.converter.10x(system.file("extdata/cellrangerdata.txt", package = "banalyzer")))
alignment.plotter.splice <- function(rbam.dataframe){
splice.tab <- rbam.dataframe %>% select(QName, UB, CIGAR, Left_pos)
#Identify reads with gaps, and if so, which reads
splice.tab <- splice.tab %>%
mutate (gap.length= ifelse(grepl("N", CIGAR) == TRUE, gsub("(.+M)([[:digit:]]+)(N.+)", "\\2", CIGAR ), 0))
splice.tab$gap.length <- as.numeric(splice.tab$gap.length)
splice.tab <- splice.tab %>%
mutate (first.match = (stringr::str_extract(CIGAR,"[[:digit:]]+M")))
splice.tab <- splice.tab %>% mutate (l.first.match= sub("M", "", first.match))
splice.tab <- splice.tab %>% mutate (right_pos = Left_pos + as.numeric(l.first.match))
donor.tab <- splice.tab %>% filter(grepl("N", CIGAR) == T)
#Add column to define plot order based on length
donor.tab <- donor.tab %>% mutate(plot.pos = paste(l.first.match, QName, sep= "_"))
plot.order <- donor.tab %>% arrange(plot.pos) %>% pull(plot.pos)
donor.tab %>%
tidyr::pivot_longer(cols = c(Left_pos, right_pos), names_to = "place", values_to = "nt.pos") %>%
ggplot(aes(x= nt.pos, y= factor(plot.pos, levels=plot.order), group=plot.pos)) +
geom_point() +
geom_line() +
theme_classic() +
theme(axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.line.y = element_blank())
}
bam.converter.10x("banalyzer/inst/extdata/cellrangerdata.txt") %>%
alignment.plotter.splice()
Next version: zoomer function
bam.converter.10x("banalyzer/inst/extdata/cellrangerdata.txt") %>%
alignment.plotter.splice() + coord_cartesian(c(0,500))