BAM files
Standard BAM file
Exported as TSV, but when imported into R it triggers parsing warnings:
# Import the exported BAM table as a headerless, tab-delimited file.
readr::read_delim(
  "banalyzer/data_raw/data/cellranger_big.txt",
  delim = "\t",
  escape_double = FALSE,
  col_names = FALSE,
  trim_ws = TRUE
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 3698 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): X1, X3, X6, X7, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, ...
## dbl (5): X2, X4, X5, X8, X9
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Bam converter 10x function
#' 10x BAM converter
#'
#' Takes a tsv file exported from a cellranger bam file as input and returns a
#' data frame.
#'
#' The first 11 columns of the file are treated as the fixed BAM fields. Every
#' further column is treated as an optional field whose first two characters
#' are its code; that code becomes the name of the output column, and a leading
#' `XX:t:` prefix is stripped from the values.
#'
#' @param text.file File address of a cellranger bam file transformed in tsv
#'   format.
#'
#' @return A data frame with single reads as rows and bam file parameters as
#'   columns: the 11 fixed fields followed by one column per two-letter code
#'   found in the file. Fields a read does not carry, and cells equal to
#'   `"-"`, are `NA`.
#' @export
#' @import dplyr
#'
#' @examples
#' bam.converter.10x(system.file("extdata/cellrangerdata.txt", package = "banalyzer"))
bam.converter.10x <- function(text.file) {
  # Read the bam file in as a tab delimited file. Parsing warnings and messages
  # are deliberately suppressed.
  raw.table <- suppressMessages(suppressWarnings(
    readr::read_delim(text.file,
                      delim = "\t", escape_double = FALSE,
                      col_names = FALSE, trim_ws = TRUE)))

  # Everything below indexes columns 12 and onwards, so fail with a clear
  # message instead of an obscure subsetting error when they are absent.
  if (ncol(raw.table) < 12) {
    stop("'text.file' must contain at least 12 tab-separated columns.",
         call. = FALSE)
  }

  # Collect which combination of information (pattern) is available for each
  # read (= each row): the two-letter code of every column after the 11th is
  # extracted and the codes of one row are pasted into a single "pattern"
  # string. Working column by column (instead of nested apply() calls) keeps
  # this correct when the file holds a single read.
  tag.columns <- raw.table[12:ncol(raw.table)]
  tag.codes <- lapply(tag.columns,
                      function(x) gsub("(^..)(.+$)", "\\1", x))
  raw.table["pattern"] <- do.call(paste, c(unname(tag.codes), sep = " "))

  # Extract the possible pattern combinations
  pattern_list <- unique(raw.table$pattern)

  # Create a vector of all the existing codes in the table
  full_pattern <-
    # Collapse the pattern list to a single string
    paste(pattern_list, collapse = " ") %>%
    # Split the single string into its codes
    strsplit(" ") %>%
    # Transform the list to a vector
    unlist() %>%
    # Keep only one copy of each code
    unique()
  # Drop the placeholder produced by empty cells (any entry containing "NA")
  full_pattern <- grep("NA", full_pattern, value = TRUE, invert = TRUE)

  # Define the column names of the final table.
  # Common names are the same for all BAM files (the first 11 columns are fixed)
  common.names <- c("QName", "FLAG", "Ref_name", "Left_pos",
                    "Map_Quality", "CIGAR", "Ref_name_mate",
                    "Pos_name_read", "Temp_length", "Sequence", "Read_Quality")
  # The final columns are the union of the common variables and all codes
  full_names <- c(common.names, full_pattern)

  # Split the raw table into a list of tables that share the same pattern of
  # values, i.e. that have the same codes in the same columns
  pattern.divided.table <- raw.table %>%
    group_by(pattern) %>%
    group_split()

  # Strip the leading "XX:t:" prefix (two characters, ":", one character, ":")
  # from a value; strings without that prefix are returned unchanged.
  prefix.remover <- function(complex.string) {
    sub("^..:.:", "", complex.string)
  }

  # Rename the columns of one pattern group after the codes they hold, add the
  # codes this group lacks as NA columns and strip the value prefixes.
  single.table.adaptor <- function(df) {
    # The column order is unchanged since the pattern was created, thus the
    # order of the codes in the pattern is the order of the columns.
    # NOTE(review): `df` still seems to carry the `pattern` column here, so
    # `tab.names` looks one entry shorter than `ncol(df)` - confirm that the
    # name assignment below behaves as intended for the trailing column.
    specific.names <- strsplit(df$pattern[1], " ") %>% unlist()
    tab.names <- c(common.names, specific.names)
    colnames(df) <- tab.names
    # Remove the columns named "NA" (cells that were empty for this pattern)
    df <- df[!colnames(df) %in% ("NA")]
    # Add the columns that are in the full pattern but not in this subtable
    missing <- setdiff(full_names, colnames(df))
    df[missing] <- NA
    # Apply the prefix remover to columns 12 to end (the ones with prefix).
    # apply() collapses its result to a plain vector when only one row is
    # available, therefore the two different branches.
    if (nrow(df) > 1) {
      df[12:ncol(df)] <- apply(df[12:ncol(df)], 2, prefix.remover)
    } else {
      df[1, 12:ncol(df)] <- lapply(df[1, 12:ncol(df)], prefix.remover)
    }
    return(df)
  }
  pattern.divided.table <- lapply(pattern.divided.table, single.table.adaptor)

  # Merge the pattern divided tables into one table; all = TRUE keeps the rows
  # of every subtable.
  output <- Reduce(function(...) merge(..., all = TRUE), pattern.divided.table)

  # Reorder (and restrict) the columns to the standard sequence
  output <- output[, full_names]

  # Replace eventual "-" placeholders with a real missing value, so that
  # is.na() recognises them like every other missing field. The mask is built
  # NA-free so existing NA cells are never used as an index.
  is.dash <- !is.na(output) & output == "-"
  output[is.dash] <- NA

  output
}
# Convert the exported cellranger BAM table and print the resulting data frame.
bam.converter.10x(
  text.file = "banalyzer/data_raw/data/cellranger_big.txt"
)