Skip to content

Commit

Permalink
Merge pull request #276 from immunomind/dev
Browse files Browse the repository at this point in the history
Immunarch 0.7.0 release
  • Loading branch information
Aleksandr Popov authored Aug 11, 2022
2 parents 2fdb2d0 + afbd98d commit 7a70786
Show file tree
Hide file tree
Showing 47 changed files with 1,791 additions and 378 deletions.
9 changes: 5 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
Package: immunarch
Type: Package
Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
Version: 0.6.9
Version: 0.7.0
Authors@R: c(
person("Vadim I.", "Nazarov", , "[email protected]", c("aut", "cre")),
person("Vasily O.", "Tsvetkov", , role = "aut"),
person("Eugene", "Rumynskiy", , role = "aut"),
person("Aleksandr A.", "Popov", , role = "aut"),
person("Ivan", "Balashov", , role = "aut"),
person("Maria", "Volobueva", , role = "aut"),
person("Maria", "Samokhina", , role = "aut"),
person("Anna", "Lorenc", , role = "ctb"),
person("Daniel J.", "Moore", , role = "ctb"),
person("Victor", "Greiff", , role = "ctb"),
Expand All @@ -18,7 +18,7 @@ Contact: [email protected]
Description: A comprehensive framework for bioinformatics exploratory analysis of bulk and single-cell
T-cell receptor and antibody repertoires. It provides seamless data loading, analysis and
visualisation for AIRR (Adaptive Immune Receptor Repertoire) data, both bulk immunosequencing (RepSeq)
and single-cell sequencing (scRNAseq). It implements most of the widely used AIRR analysis methods,
and single-cell sequencing (scRNAseq). Immunarch implements most of the widely used AIRR analysis methods,
such as: clonality analysis, estimation of repertoire similarities in distribution of clonotypes
and gene segments, repertoire diversity analysis, annotation of clonotypes using external immune receptor
databases and clonotype tracking in vaccination and cancer studies. A successor to our
Expand Down Expand Up @@ -65,7 +65,8 @@ Imports:
glue,
phangorn,
uuid,
stringi
stringi,
ggraph
Depends:
R (>= 4.0.0),
ggplot2 (>= 3.1.0),
Expand Down
20 changes: 19 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Generated by roxygen2: do not edit by hand

S3method(vis,clonal_family)
S3method(vis,clonal_family_tree)
S3method(vis,immunr_chao1)
S3method(vis,immunr_clonal_prop)
S3method(vis,immunr_dbscan)
Expand Down Expand Up @@ -35,6 +37,7 @@ S3method(vis,immunr_spectr)
S3method(vis,immunr_spectr_nogene)
S3method(vis,immunr_top_prop)
S3method(vis,immunr_tsne)
S3method(vis,step_failure_ignored)
export(apply_asymm)
export(apply_symm)
export(bunch_translate)
Expand Down Expand Up @@ -87,6 +90,7 @@ export(repOverlap)
export(repOverlapAnalysis)
export(repSample)
export(repSave)
export(repSomaticHypermutation)
export(select_barcodes)
export(select_clusters)
export(seqCluster)
Expand All @@ -111,7 +115,7 @@ importFrom(Rcpp,sourceCpp)
importFrom(UpSetR,fromExpression)
importFrom(UpSetR,upset)
importFrom(ape,as.DNAbin)
importFrom(ape,muscle)
importFrom(ape,clustal)
importFrom(ape,read.tree)
importFrom(circlize,chordDiagram)
importFrom(data.table,":=")
Expand All @@ -125,6 +129,7 @@ importFrom(data.table,setDT)
importFrom(data.table,setcolorder)
importFrom(data.table,setnames)
importFrom(doParallel,registerDoParallel)
importFrom(doParallel,stopImplicitCluster)
importFrom(dplyr,arrange)
importFrom(dplyr,as_tibble)
importFrom(dplyr,collect)
Expand Down Expand Up @@ -166,6 +171,10 @@ importFrom(ggpubr,ggscatter)
importFrom(ggpubr,rotate_x_text)
importFrom(ggpubr,stat_compare_means)
importFrom(ggpubr,theme_pubr)
importFrom(ggraph,geom_edge_diagonal)
importFrom(ggraph,geom_node_point)
importFrom(ggraph,ggraph)
importFrom(ggraph,theme_graph)
importFrom(ggseqlogo,geom_logo)
importFrom(ggseqlogo,theme_logo)
importFrom(glue,glue)
Expand All @@ -181,13 +190,18 @@ importFrom(magrittr,"%>%")
importFrom(magrittr,extract2)
importFrom(magrittr,set_attr)
importFrom(methods,as)
importFrom(parallel,clusterExport)
importFrom(parallel,detectCores)
importFrom(parallel,makeCluster)
importFrom(parallel,mclapply)
importFrom(parallel,parApply)
importFrom(parallel,stopCluster)
importFrom(patchwork,plot_annotation)
importFrom(patchwork,wrap_plots)
importFrom(phangorn,write.phyDat)
importFrom(pheatmap,pheatmap)
importFrom(plyr,.)
importFrom(plyr,adply)
importFrom(plyr,dlply)
importFrom(plyr,mapvalues)
importFrom(purrr,imap)
Expand Down Expand Up @@ -256,8 +270,10 @@ importFrom(stringdist,stringdistmatrix)
importFrom(stringi,stri_replace_all_fixed)
importFrom(stringr,boundary)
importFrom(stringr,fixed)
importFrom(stringr,str_c)
importFrom(stringr,str_count)
importFrom(stringr,str_detect)
importFrom(stringr,str_extract)
importFrom(stringr,str_extract_all)
importFrom(stringr,str_length)
importFrom(stringr,str_match)
Expand All @@ -270,7 +286,9 @@ importFrom(stringr,str_sub)
importFrom(stringr,str_trim)
importFrom(tibble,rownames_to_column)
importFrom(tibble,tibble)
importFrom(tidyr,drop_na)
importFrom(tidyr,unite)
importFrom(tidyr,unnest)
importFrom(tidyselect,starts_with)
importFrom(utils,capture.output)
importFrom(utils,packageVersion)
Expand Down
123 changes: 74 additions & 49 deletions R/align_lineage.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
#' This function aligns all sequences (incliding germline) that belong to one clonal lineage and one cluster.
#' After clustering and building the clonal lineage and germline, the next step is to analyze the degree of mutation
#' and maturity of each clonal lineage. This allows for finding high mature cells and cells with a large
#' number of offspring. The phylogenetic analysis will find mutations that increase the affinity of BCR.
#' Making alignment of the sequence is the first step towards sequence analysis including BCR.
#' Aligns all sequences incliding germline within each clonal lineage within each cluster
#'
#' @concept align_lineage
#'
Expand All @@ -14,11 +10,16 @@
#' @importFrom purrr map_dfr
#' @importFrom rlist list.remove
#' @importFrom utils str
#' @importFrom ape as.DNAbin muscle
#' @importFrom doParallel registerDoParallel
#' @importFrom ape as.DNAbin clustal
#' @importFrom doParallel registerDoParallel stopImplicitCluster
#' @importFrom parallel mclapply

#' @description Aligns all sequences incliding germline within each clonal lineage within each cluster
#' @description This function aligns all sequences (incliding germline) that belong to one clonal
#' lineage and one cluster. After clustering and building the clonal lineage and germline, the next
#' step is to analyze the degree of mutation and maturity of each clonal lineage. This allows for
#' finding high mature cells and cells with a large number of offspring. The phylogenetic analysis
#' will find mutations that increase the affinity of BCR. Making alignment of the sequence
#' is the first step towards sequence analysis including BCR.
#'
#' @usage
#'
Expand All @@ -39,13 +40,13 @@
#' @param .align_threads Number of threads for lineage alignment.
#'
#' It must have columns in the immunarch compatible format \link{immunarch_data_format}, and also
#' must contain 'Cluster' column, which is added by seqCluster() function, and 'Sequence.germline'
#' must contain 'Cluster' column, which is added by seqCluster() function, and 'Germline.sequence'
#' column, which is added by repGermline() function.
#'
#' @param .verbose_output If TRUE, all output dataframe columns will be included (see documentation about this
#' function return), and unaligned clusters will be included in the output. Setting this to TRUE significantly
#' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() calculation
#' will be included in the output.
#' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() and
#' repSomaticHypermutation() calculation will be included in the output.
#'
#' @param .nofail Will return NA instead of stopping if Clustal W is not installed.
#' Used to avoid raising errors in examples on computers where Clustal W is not installed.
Expand All @@ -56,16 +57,21 @@
#' The dataframe has these columns:
#' * Cluster: cluster name
#' * Germline: germline sequence
#' * V.germline.nt: germline V gene sequence
#' * J.germline.nt: germline J gene sequence
#' * CDR3.germline.length: length of CDR3 in germline
#' * Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage
#' (.min_lineage_sequences is below the threshold); TRUE if it was aligned
#' * Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE)
#' * V.length (included if .verbose_output=TRUE): shortest length of V gene part outside of CDR3 region in this
#' * V.length: shortest length of V gene part outside of CDR3 region in this
#' group of sequences; longer V genes (including germline) are trimmed to this length before alignment
#' * J.length (included if .verbose_output=TRUE): shortest length of J gene part outside of CDR3 region in this
#' * J.length: shortest length of J gene part outside of CDR3 region in this
#' group of sequences; longer J genes (including germline) are trimmed to this length before alignment
#' * Sequences (included if .verbose_output=TRUE): nested dataframe containing all sequences for this combination
#' * Sequences: nested dataframe containing all sequences for this combination
#' of cluster and germline; it has columns
#' Sequence, V.end, J.start, CDR3.start, CDR3.end; all values taken from the input dataframe
#' Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt
#' and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end;
#' all values taken from the input dataframe
#'
#' @examples
#'
Expand All @@ -74,7 +80,7 @@
#'
#' bcr_data %>%
#' seqCluster(seqDist(bcr_data), .fixed_threshold = 3) %>%
#' repGermline() %>%
#' repGermline(.threads = 1) %>%
#' repAlignLineage(.min_lineage_sequences = 2, .align_threads = 2, .nofail = TRUE)
#' @export repAlignLineage
repAlignLineage <- function(.data,
Expand All @@ -88,22 +94,30 @@ repAlignLineage <- function(.data,
"Please download it from here: http://www.clustal.org/download/current/\n",
"or install it with your system package manager (such as apt or dnf)."
), .nofail)) {
return(NA)
return(get_empty_object_with_class("step_failure_ignored"))
}

doParallel::registerDoParallel(cores = .prepare_threads)
.data %<>%
apply_to_sample_or_list(
align_single_df,
.min_lineage_sequences = .min_lineage_sequences,
.parallel_prepare = .prepare_threads > 1,
.align_threads = .align_threads,
.verbose_output = .verbose_output
)
doParallel::stopImplicitCluster()
return(.data)
}

align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbose_output) {
for (required_column in c("Cluster", "Germline.sequence")) {
align_single_df <- function(data,
.min_lineage_sequences,
.parallel_prepare,
.align_threads,
.verbose_output) {
for (required_column in c(
"Cluster", "Germline.sequence", "V.germline.nt", "J.germline.nt", "CDR3.germline.length"
)) {
if (!(required_column %in% colnames(data))) {
stop(
"Found dataframe without required column ",
Expand All @@ -120,7 +134,7 @@ align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbo
.fun = prepare_results_row,
.min_lineage_sequences = .min_lineage_sequences,
.verbose_output = .verbose_output,
.parallel = TRUE
.parallel = .parallel_prepare
) %>%
`[`(!is.na(.)) %>%
unname()
Expand All @@ -142,14 +156,17 @@ align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbo
mc.cores = .align_threads
)

return(convert_results_to_df(results, alignments, .verbose_output))
return(convert_results_to_df(results, alignments))
}

# this function accepts dataframe subset containing rows only for current lineage
# and returns named list containing 1 row for results dataframe
prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose_output) {
cluster_name <- lineage_subset[[1, "Cluster"]]
germline_seq <- lineage_subset[[1, "Germline.sequence"]]
germline_v <- lineage_subset[[1, "V.germline.nt"]]
germline_j <- lineage_subset[[1, "J.germline.nt"]]
germline_cdr3_len <- lineage_subset[[1, "CDR3.germline.length"]]
aligned <- nrow(lineage_subset) >= .min_lineage_sequences

if (!aligned & !.verbose_output) {
Expand All @@ -163,13 +180,19 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
lineage_subset[["Sequence"]], lineage_subset[["J.start"]], lineage_subset[["CDR3.end"]]
)

sequences_columns <- c(
"Sequence", "Clone.ID", "Clones",
"CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt"
)
if (.verbose_output) {
sequences <- lineage_subset[c("Sequence", "V.end", "J.start", "CDR3.start", "CDR3.end")]
sequences_columns %<>% c("V.end", "J.start", "CDR3.start", "CDR3.end")
}
sequences <- lineage_subset[sequences_columns]
sequences[["Clone.ID"]] %<>% as.integer()
sequences[["Clones"]] %<>% as.integer()

germline_parts <- strsplit(germline_seq, "N")[[1]]
germline_v_len <- stringr::str_length(germline_parts[1])
germline_j_len <- stringr::str_length(tail(germline_parts, 1))
germline_v_len <- str_length(germline_v)
germline_j_len <- str_length(germline_j)
v_min_len <- min(lineage_subset[["V.lengths"]], germline_v_len)
j_min_len <- min(lineage_subset[["J.lengths"]], germline_j_len)

Expand All @@ -181,12 +204,21 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
lineage_subset[["J.lengths"]],
j_min_len
)
alignment <- convert_to_dnabin(germline_trimmed, clonotypes_trimmed)

clonotypes_names <- sapply(lineage_subset[["Clone.ID"]], function(id) {
paste0("ID_", id)
})
all_sequences_list <- c(list(germline_trimmed), as.list(clonotypes_trimmed))
names(all_sequences_list) <- c("Germline", clonotypes_names)
alignment <- convert_seq_list_to_dnabin(all_sequences_list)

if (.verbose_output) {
return(list(
Cluster = cluster_name,
Germline = germline_seq,
V.germline.nt = germline_v,
J.germline.nt = germline_j,
CDR3.germline.length = germline_cdr3_len,
Aligned = aligned,
Alignment = alignment,
V.length = v_min_len,
Expand All @@ -197,43 +229,36 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
return(list(
Cluster = cluster_name,
Germline = germline_seq,
Alignment = alignment
V.germline.nt = germline_v,
J.germline.nt = germline_j,
CDR3.germline.length = germline_cdr3_len,
Alignment = alignment,
V.length = v_min_len,
J.length = j_min_len,
Sequences = sequences
))
}
}

convert_to_dnabin <- function(germline_seq, clonotypes) {
all_sequences_list <- c(list(germline = germline_seq), as.list(clonotypes))
dnabin <- all_sequences_list %>%
lapply(
function(sequence) {
sequence %>%
stringr::str_extract_all(stringr::boundary("character")) %>%
unlist()
}
) %>%
ape::as.DNAbin()
return(dnabin)
}

# trim V/J tails in sequence to the specified lenghts v_min, j_min
trim_seq <- function(seq, v_len, v_min, j_len, j_min) {
stringr::str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
}

convert_results_to_df <- function(nested_results_list, nested_alignments_list, .verbose_output) {
convert_results_to_df <- function(nested_results_list, nested_alignments_list) {
alignments <- nested_alignments_list %>%
lapply(magrittr::extract2, "Alignment") %>%
tibble(Alignment = .)
sequences <- nested_results_list %>%
lapply(magrittr::extract2, "Sequences") %>%
tibble(Sequences = .)
df <- nested_results_list %>%
lapply(rlist::list.remove, c("Alignment", "Sequences")) %>%
purrr::map_dfr(~.) %>%
cbind(alignments)
if (.verbose_output) {
sequences <- nested_results_list %>%
lapply(magrittr::extract2, "Sequences") %>%
tibble(Sequences = .)
df %<>% cbind(sequences)
cbind(alignments, sequences)
# fix column types after dataframe rebuilding
for (column in c("CDR3.germline.length", "V.length", "J.length")) {
df[[column]] %<>% as.integer()
}
return(df)
}
Expand Down
2 changes: 2 additions & 0 deletions R/distance.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#' Function for computing distance for sequences
#'
#' @concept distance
#'
#' @importFrom stringdist stringdistmatrix
#' @importFrom purrr map pmap map2
#' @importFrom magrittr %>% %<>% set_attr
Expand Down
Loading

0 comments on commit 7a70786

Please sign in to comment.