Merge pull request #276 from immunomind/dev

Immunarch 0.7.0 release
immunomind · Aug 11, 2022 · 7a70786 · 7a70786
2 parents 2fdb2d0 + afbd98d
commit 7a70786
Show file tree

Hide file tree

Showing 47 changed files with 1,791 additions and 378 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,14 +1,14 @@
 Package: immunarch
 Type: Package
 Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
-Version: 0.6.9
+Version: 0.7.0
 Authors@R: c(
     person("Vadim I.", "Nazarov", , "[email protected]", c("aut", "cre")),
     person("Vasily O.", "Tsvetkov", , role = "aut"),
     person("Eugene", "Rumynskiy", , role = "aut"),
     person("Aleksandr A.", "Popov", , role = "aut"),
     person("Ivan", "Balashov", , role = "aut"),
-    person("Maria", "Volobueva", , role = "aut"),
+    person("Maria", "Samokhina", , role = "aut"),
     person("Anna", "Lorenc", , role = "ctb"),
     person("Daniel J.", "Moore", , role = "ctb"),
     person("Victor", "Greiff", , role = "ctb"),
@@ -18,7 +18,7 @@ Contact: [email protected]
 Description: A comprehensive framework for bioinformatics exploratory analysis of bulk and single-cell
     T-cell receptor and antibody repertoires. It provides seamless data loading, analysis and
     visualisation for AIRR (Adaptive Immune Receptor Repertoire) data, both bulk immunosequencing (RepSeq)
-    and single-cell sequencing (scRNAseq). It implements most of the widely used AIRR analysis methods,
+    and single-cell sequencing (scRNAseq). Immunarch implements most of the widely used AIRR analysis methods,
     such as: clonality analysis, estimation of repertoire similarities in distribution of clonotypes
     and gene segments, repertoire diversity analysis, annotation of clonotypes using external immune receptor
     databases and clonotype tracking in vaccination and cancer studies. A successor to our
@@ -65,7 +65,8 @@ Imports:
     glue,
     phangorn,
     uuid,
-    stringi
+    stringi,
+    ggraph
 Depends:
     R (>= 4.0.0),
     ggplot2 (>= 3.1.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(vis,clonal_family)
+S3method(vis,clonal_family_tree)
 S3method(vis,immunr_chao1)
 S3method(vis,immunr_clonal_prop)
 S3method(vis,immunr_dbscan)
@@ -35,6 +37,7 @@ S3method(vis,immunr_spectr)
 S3method(vis,immunr_spectr_nogene)
 S3method(vis,immunr_top_prop)
 S3method(vis,immunr_tsne)
+S3method(vis,step_failure_ignored)
 export(apply_asymm)
 export(apply_symm)
 export(bunch_translate)
@@ -87,6 +90,7 @@ export(repOverlap)
 export(repOverlapAnalysis)
 export(repSample)
 export(repSave)
+export(repSomaticHypermutation)
 export(select_barcodes)
 export(select_clusters)
 export(seqCluster)
@@ -111,7 +115,7 @@ importFrom(Rcpp,sourceCpp)
 importFrom(UpSetR,fromExpression)
 importFrom(UpSetR,upset)
 importFrom(ape,as.DNAbin)
-importFrom(ape,muscle)
+importFrom(ape,clustal)
 importFrom(ape,read.tree)
 importFrom(circlize,chordDiagram)
 importFrom(data.table,":=")
@@ -125,6 +129,7 @@ importFrom(data.table,setDT)
 importFrom(data.table,setcolorder)
 importFrom(data.table,setnames)
 importFrom(doParallel,registerDoParallel)
+importFrom(doParallel,stopImplicitCluster)
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,collect)
@@ -166,6 +171,10 @@ importFrom(ggpubr,ggscatter)
 importFrom(ggpubr,rotate_x_text)
 importFrom(ggpubr,stat_compare_means)
 importFrom(ggpubr,theme_pubr)
+importFrom(ggraph,geom_edge_diagonal)
+importFrom(ggraph,geom_node_point)
+importFrom(ggraph,ggraph)
+importFrom(ggraph,theme_graph)
 importFrom(ggseqlogo,geom_logo)
 importFrom(ggseqlogo,theme_logo)
 importFrom(glue,glue)
@@ -181,13 +190,18 @@ importFrom(magrittr,"%>%")
 importFrom(magrittr,extract2)
 importFrom(magrittr,set_attr)
 importFrom(methods,as)
+importFrom(parallel,clusterExport)
 importFrom(parallel,detectCores)
+importFrom(parallel,makeCluster)
 importFrom(parallel,mclapply)
+importFrom(parallel,parApply)
+importFrom(parallel,stopCluster)
 importFrom(patchwork,plot_annotation)
 importFrom(patchwork,wrap_plots)
 importFrom(phangorn,write.phyDat)
 importFrom(pheatmap,pheatmap)
 importFrom(plyr,.)
+importFrom(plyr,adply)
 importFrom(plyr,dlply)
 importFrom(plyr,mapvalues)
 importFrom(purrr,imap)
@@ -256,8 +270,10 @@ importFrom(stringdist,stringdistmatrix)
 importFrom(stringi,stri_replace_all_fixed)
 importFrom(stringr,boundary)
 importFrom(stringr,fixed)
+importFrom(stringr,str_c)
 importFrom(stringr,str_count)
 importFrom(stringr,str_detect)
+importFrom(stringr,str_extract)
 importFrom(stringr,str_extract_all)
 importFrom(stringr,str_length)
 importFrom(stringr,str_match)
@@ -270,7 +286,9 @@ importFrom(stringr,str_sub)
 importFrom(stringr,str_trim)
 importFrom(tibble,rownames_to_column)
 importFrom(tibble,tibble)
+importFrom(tidyr,drop_na)
 importFrom(tidyr,unite)
+importFrom(tidyr,unnest)
 importFrom(tidyselect,starts_with)
 importFrom(utils,capture.output)
 importFrom(utils,packageVersion)

diff --git a/R/align_lineage.R b/R/align_lineage.R
@@ -1,8 +1,4 @@
-#' This function aligns all sequences (incliding germline) that belong to one clonal lineage and one cluster.
-#' After clustering and building the clonal lineage and germline, the next step is to analyze the degree of mutation
-#' and maturity of each clonal lineage. This allows for finding high mature cells and cells with a large
-#' number of offspring. The phylogenetic analysis will find mutations that increase the affinity of BCR.
-#' Making alignment of the sequence is the first step towards sequence analysis including BCR.
+#' Aligns all sequences incliding germline within each clonal lineage within each cluster
 #'
 #' @concept align_lineage
 #'
@@ -14,11 +10,16 @@
 #' @importFrom purrr map_dfr
 #' @importFrom rlist list.remove
 #' @importFrom utils str
-#' @importFrom ape as.DNAbin muscle
-#' @importFrom doParallel registerDoParallel
+#' @importFrom ape as.DNAbin clustal
+#' @importFrom doParallel registerDoParallel stopImplicitCluster
 #' @importFrom parallel mclapply
 
-#' @description Aligns all sequences incliding germline within each clonal lineage within each cluster
+#' @description This function aligns all sequences (incliding germline) that belong to one clonal
+#' lineage and one cluster. After clustering and building the clonal lineage and germline, the next
+#' step is to analyze the degree of mutation and maturity of each clonal lineage. This allows for
+#' finding high mature cells and cells with a large number of offspring. The phylogenetic analysis
+#' will find mutations that increase the affinity of BCR. Making alignment of the sequence
+#' is the first step towards sequence analysis including BCR.
 #'
 #' @usage
 #'
@@ -39,13 +40,13 @@
 #' @param .align_threads Number of threads for lineage alignment.
 #'
 #' It must have columns in the immunarch compatible format \link{immunarch_data_format}, and also
-#' must contain 'Cluster' column, which is added by seqCluster() function, and 'Sequence.germline'
+#' must contain 'Cluster' column, which is added by seqCluster() function, and 'Germline.sequence'
 #' column, which is added by repGermline() function.
 #'
 #' @param .verbose_output If TRUE, all output dataframe columns will be included (see documentation about this
 #' function return), and unaligned clusters will be included in the output. Setting this to TRUE significantly
-#' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() calculation
-#' will be included in the output.
+#' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() and
+#' repSomaticHypermutation() calculation will be included in the output.
 #'
 #' @param .nofail Will return NA instead of stopping if Clustal W is not installed.
 #' Used to avoid raising errors in examples on computers where Clustal W is not installed.
@@ -56,16 +57,21 @@
 #' The dataframe has these columns:
 #' * Cluster: cluster name
 #' * Germline: germline sequence
+#' * V.germline.nt: germline V gene sequence
+#' * J.germline.nt: germline J gene sequence
+#' * CDR3.germline.length: length of CDR3 in germline
 #' * Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage
 #'   (.min_lineage_sequences is below the threshold); TRUE if it was aligned
 #' * Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE)
-#' * V.length (included if .verbose_output=TRUE): shortest length of V gene part outside of CDR3 region in this
+#' * V.length: shortest length of V gene part outside of CDR3 region in this
 #'   group of sequences; longer V genes (including germline) are trimmed to this length before alignment
-#' * J.length (included if .verbose_output=TRUE): shortest length of J gene part outside of CDR3 region in this
+#' * J.length: shortest length of J gene part outside of CDR3 region in this
 #'   group of sequences; longer J genes (including germline) are trimmed to this length before alignment
-#' * Sequences (included if .verbose_output=TRUE): nested dataframe containing all sequences for this combination
+#' * Sequences: nested dataframe containing all sequences for this combination
 #'   of cluster and germline; it has columns
-#'   Sequence, V.end, J.start, CDR3.start, CDR3.end; all values taken from the input dataframe
+#'   Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt
+#'   and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end;
+#'   all values taken from the input dataframe
 #'
 #' @examples
 #'
@@ -74,7 +80,7 @@
 #'
 #' bcr_data %>%
 #'   seqCluster(seqDist(bcr_data), .fixed_threshold = 3) %>%
-#'   repGermline() %>%
+#'   repGermline(.threads = 1) %>%
 #'   repAlignLineage(.min_lineage_sequences = 2, .align_threads = 2, .nofail = TRUE)
 #' @export repAlignLineage
 repAlignLineage <- function(.data,
@@ -88,22 +94,30 @@ repAlignLineage <- function(.data,
     "Please download it from here: http://www.clustal.org/download/current/\n",
     "or install it with your system package manager (such as apt or dnf)."
   ), .nofail)) {
-    return(NA)
+    return(get_empty_object_with_class("step_failure_ignored"))
   }
 
   doParallel::registerDoParallel(cores = .prepare_threads)
   .data %<>%
     apply_to_sample_or_list(
       align_single_df,
       .min_lineage_sequences = .min_lineage_sequences,
+      .parallel_prepare = .prepare_threads > 1,
       .align_threads = .align_threads,
       .verbose_output = .verbose_output
     )
+  doParallel::stopImplicitCluster()
   return(.data)
 }
 
-align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbose_output) {
-  for (required_column in c("Cluster", "Germline.sequence")) {
+align_single_df <- function(data,
+                            .min_lineage_sequences,
+                            .parallel_prepare,
+                            .align_threads,
+                            .verbose_output) {
+  for (required_column in c(
+    "Cluster", "Germline.sequence", "V.germline.nt", "J.germline.nt", "CDR3.germline.length"
+  )) {
     if (!(required_column %in% colnames(data))) {
       stop(
         "Found dataframe without required column ",
@@ -120,7 +134,7 @@ align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbo
       .fun = prepare_results_row,
       .min_lineage_sequences = .min_lineage_sequences,
       .verbose_output = .verbose_output,
-      .parallel = TRUE
+      .parallel = .parallel_prepare
     ) %>%
     `[`(!is.na(.)) %>%
     unname()
@@ -142,14 +156,17 @@ align_single_df <- function(data, .min_lineage_sequences, .align_threads, .verbo
     mc.cores = .align_threads
   )
 
-  return(convert_results_to_df(results, alignments, .verbose_output))
+  return(convert_results_to_df(results, alignments))
 }
 
 # this function accepts dataframe subset containing rows only for current lineage
 # and returns named list containing 1 row for results dataframe
 prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose_output) {
   cluster_name <- lineage_subset[[1, "Cluster"]]
   germline_seq <- lineage_subset[[1, "Germline.sequence"]]
+  germline_v <- lineage_subset[[1, "V.germline.nt"]]
+  germline_j <- lineage_subset[[1, "J.germline.nt"]]
+  germline_cdr3_len <- lineage_subset[[1, "CDR3.germline.length"]]
   aligned <- nrow(lineage_subset) >= .min_lineage_sequences
 
   if (!aligned & !.verbose_output) {
@@ -163,13 +180,19 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
     lineage_subset[["Sequence"]], lineage_subset[["J.start"]], lineage_subset[["CDR3.end"]]
   )
 
+  sequences_columns <- c(
+    "Sequence", "Clone.ID", "Clones",
+    "CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt"
+  )
   if (.verbose_output) {
-    sequences <- lineage_subset[c("Sequence", "V.end", "J.start", "CDR3.start", "CDR3.end")]
+    sequences_columns %<>% c("V.end", "J.start", "CDR3.start", "CDR3.end")
   }
+  sequences <- lineage_subset[sequences_columns]
+  sequences[["Clone.ID"]] %<>% as.integer()
+  sequences[["Clones"]] %<>% as.integer()
 
-  germline_parts <- strsplit(germline_seq, "N")[[1]]
-  germline_v_len <- stringr::str_length(germline_parts[1])
-  germline_j_len <- stringr::str_length(tail(germline_parts, 1))
+  germline_v_len <- str_length(germline_v)
+  germline_j_len <- str_length(germline_j)
   v_min_len <- min(lineage_subset[["V.lengths"]], germline_v_len)
   j_min_len <- min(lineage_subset[["J.lengths"]], germline_j_len)
 
@@ -181,12 +204,21 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
     lineage_subset[["J.lengths"]],
     j_min_len
   )
-  alignment <- convert_to_dnabin(germline_trimmed, clonotypes_trimmed)
+
+  clonotypes_names <- sapply(lineage_subset[["Clone.ID"]], function(id) {
+    paste0("ID_", id)
+  })
+  all_sequences_list <- c(list(germline_trimmed), as.list(clonotypes_trimmed))
+  names(all_sequences_list) <- c("Germline", clonotypes_names)
+  alignment <- convert_seq_list_to_dnabin(all_sequences_list)
 
   if (.verbose_output) {
     return(list(
       Cluster = cluster_name,
       Germline = germline_seq,
+      V.germline.nt = germline_v,
+      J.germline.nt = germline_j,
+      CDR3.germline.length = germline_cdr3_len,
       Aligned = aligned,
       Alignment = alignment,
       V.length = v_min_len,
@@ -197,43 +229,36 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
     return(list(
       Cluster = cluster_name,
       Germline = germline_seq,
-      Alignment = alignment
+      V.germline.nt = germline_v,
+      J.germline.nt = germline_j,
+      CDR3.germline.length = germline_cdr3_len,
+      Alignment = alignment,
+      V.length = v_min_len,
+      J.length = j_min_len,
+      Sequences = sequences
     ))
   }
 }
 
-convert_to_dnabin <- function(germline_seq, clonotypes) {
-  all_sequences_list <- c(list(germline = germline_seq), as.list(clonotypes))
-  dnabin <- all_sequences_list %>%
-    lapply(
-      function(sequence) {
-        sequence %>%
-          stringr::str_extract_all(stringr::boundary("character")) %>%
-          unlist()
-      }
-    ) %>%
-    ape::as.DNAbin()
-  return(dnabin)
-}
-
 # trim V/J tails in sequence to the specified lenghts v_min, j_min
 trim_seq <- function(seq, v_len, v_min, j_len, j_min) {
-  stringr::str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
+  str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
 }
 
-convert_results_to_df <- function(nested_results_list, nested_alignments_list, .verbose_output) {
+convert_results_to_df <- function(nested_results_list, nested_alignments_list) {
   alignments <- nested_alignments_list %>%
     lapply(magrittr::extract2, "Alignment") %>%
     tibble(Alignment = .)
+  sequences <- nested_results_list %>%
+    lapply(magrittr::extract2, "Sequences") %>%
+    tibble(Sequences = .)
   df <- nested_results_list %>%
     lapply(rlist::list.remove, c("Alignment", "Sequences")) %>%
     purrr::map_dfr(~.) %>%
-    cbind(alignments)
-  if (.verbose_output) {
-    sequences <- nested_results_list %>%
-      lapply(magrittr::extract2, "Sequences") %>%
-      tibble(Sequences = .)
-    df %<>% cbind(sequences)
+    cbind(alignments, sequences)
+  # fix column types after dataframe rebuilding
+  for (column in c("CDR3.germline.length", "V.length", "J.length")) {
+    df[[column]] %<>% as.integer()
   }
   return(df)
 }

diff --git a/R/distance.R b/R/distance.R
@@ -1,5 +1,7 @@
 #' Function for computing distance for sequences
 #'
+#' @concept distance
+#'
 #' @importFrom stringdist stringdistmatrix
 #' @importFrom purrr map pmap map2
 #' @importFrom magrittr %>% %<>% set_attr