Merge pull request #403 from immunomind/dev

Dev
immunomind · Mar 19, 2024 · 0b6544a · 0b6544a
2 parents ac2c840 + 93b6e81
commit 0b6544a
Show file tree

Hide file tree

Showing 27 changed files with 405 additions and 827 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,11 @@
 Package: immunarch
 Type: Package
 Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
-Version: 0.9.0
+Version: 0.9.1
 Authors@R: c(
     person("Vadim I.", "Nazarov", , "[email protected]", c("aut", "cre")),
     person("Vasily O.", "Tsvetkov", , role = "aut"),
+    person("Siarhei", "Fiadziushchanka", , role = "aut"),
     person("Eugene", "Rumynskiy", , role = "aut"),
     person("Aleksandr A.", "Popov", , role = "aut"),
     person("Ivan", "Balashov", , role = "aut"),
@@ -23,7 +24,7 @@ Description: A comprehensive framework for bioinformatics exploratory analysis o
     and gene segments, repertoire diversity analysis, annotation of clonotypes using external immune receptor
     databases and clonotype tracking in vaccination and cancer studies. A successor to our
     previously published 'tcR' immunoinformatics package (Nazarov 2015) <doi:10.1186/s12859-015-0613-1>.
-License: AGPL-3
+License: Apache License (== 2.0)
 URL: https://immunarch.com/, https://github.com/immunomind/immunarch
 BugReports: https://github.com/immunomind/immunarch/issues
 Imports:
@@ -84,6 +85,6 @@ Suggests:
     rmarkdown
 VignetteBuilder: knitr
 Encoding: UTF-8
-RoxygenNote: 7.2.2
+RoxygenNote: 7.3.1
 LazyData: true
 LazyDataCompression: xz
diff --git a/LICENSE b/LICENSE
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,13 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(cosine_sim,default)
+S3method(cosine_sim,numeric)
+S3method(jaccard_index,character)
+S3method(jaccard_index,default)
+S3method(overlap_coef,character)
+S3method(overlap_coef,default)
+S3method(tversky_index,character)
+S3method(tversky_index,default)
 S3method(vis,clonal_family)
 S3method(vis,clonal_family_tree)
 S3method(vis,immunr_chao1)
@@ -150,6 +158,7 @@ importFrom(dplyr,n)
 importFrom(dplyr,one_of)
 importFrom(dplyr,pull)
 importFrom(dplyr,rename)
+importFrom(dplyr,row_number)
 importFrom(dplyr,rowwise)
 importFrom(dplyr,select)
 importFrom(dplyr,select_)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -2,9 +2,10 @@
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
 fill_vec <- function(read_vec, read_indices) {
-  .Call(`_immunarch_fill_vec`, read_vec, read_indices)
+    .Call(`_immunarch_fill_vec`, read_vec, read_indices)
 }
 
 fill_reads <- function(new_reads, new_counts) {
-  .Call(`_immunarch_fill_reads`, new_reads, new_counts)
+    .Call(`_immunarch_fill_reads`, new_reads, new_counts)
 }
+
diff --git a/R/explore.R b/R/explore.R
@@ -145,5 +145,3 @@ repExplore <- function(.data, .method = c("volume", "count", "len", "clones"), .
 
   res
 }
-
-rep.ex <- repExplore
diff --git a/R/immunarch-remaster.R b/R/immunarch-remaster.R
@@ -0,0 +1,25 @@
+# .check_immundata <- function (.object) {
+#   if (!is.instance(.object, "ImmunData")) {
+#     stop("Error: the input object is not of class ImmunData. Immunarch works on ImmunData only. Helpful manual: ...")
+#   }
+# }
+#
+# .repertoire_overlap <- function (.data, .method, .verbose = TRUE, ...) {
+#   check_immundata(.data)
+# }
+#
+# .gene_usage <- function (.data, .gene, .type, .use_counts, .norm, .gene_vec) {
+#
+# }
+#
+# .repertoire_diversity <- function (.data, .method, .verbose = TRUE, ...) {
+#
+# }
+#
+# .track_clonotypes <- function () {
+#
+# }
+#
+# .public_repertoire <- function () {
+#
+# }
diff --git a/R/io-parsers.R b/R/io-parsers.R
@@ -153,8 +153,10 @@ parse_repertoire <- function(.filename, .mode, .nuc.seq, .aa.seq, .count,
     .vend, .dstart, .dend, .jstart,
     .total.insertions, .vd.insertions, .dj.insertions
   )
-  if (!is.na(.add[1])) {
+  if (!has_no_data(.add)) {
     vec_names <- c(vec_names, .add)
+    # add missing columns
+    df %<>% add_empty_columns(.add[!(.add %in% colnames(df))])
   }
 
   df <- df[, vec_names]
@@ -400,7 +402,7 @@ parse_mitcr <- function(.filename, .mode) {
 }
 
 parse_mixcr <- function(.filename, .mode, .count = c("clonecount", "readcount")) {
-  .filename <- .filename
+  .filename %<>% .as_tsv()
   .id <- "cloneid"
   .count %<>% tolower()
   .sep <- "\t"
@@ -727,6 +729,11 @@ parse_mixcr <- function(.filename, .mode, .count = c("clonecount", "readcount"))
     }
   }
 
+  # fill the cloneid column if it does not exist
+  if (!(.id %in% colnames(df))) {
+    df %<>% mutate("{.id}" := row_number())
+  }
+
   df <- df[, make.names(df_columns)]
   colnames(df) <- df_column_names
 
@@ -962,13 +969,18 @@ parse_airr <- function(.filename, .mode) {
     .as_tsv() %>%
     airr::read_rearrangement()
 
+  bcr_pipeline_columns <- c(
+    "cdr1", "cdr2", "cdr1_aa", "cdr2_aa", "fwr1", "fwr2", "fwr3", "fwr4",
+    "fwr1_aa", "fwr2_aa", "fwr3_aa", "fwr4_aa"
+  )
   df %<>%
-    select_(
+    add_empty_columns(bcr_pipeline_columns[!(bcr_pipeline_columns %in% colnames(df))]) %>%
+    select(
       "sequence", "v_call", "d_call", "j_call", "junction", "junction_aa",
-      ~contains("v_germline_end"), ~contains("d_germline_start"),
-      ~contains("d_germline_end"), ~contains("j_germline_start"),
-      ~contains("np1_length"), ~contains("np2_length"),
-      ~contains("duplicate_count"),
+      contains("v_germline_end"), contains("d_germline_start"),
+      contains("d_germline_end"), contains("j_germline_start"),
+      contains("np1_length"), contains("np2_length"),
+      contains("duplicate_count"),
       "cdr1", "cdr2", "cdr1_aa", "cdr2_aa", "fwr1", "fwr2", "fwr3", "fwr4",
       "fwr1_aa", "fwr2_aa", "fwr3_aa", "fwr4_aa"
     )

diff --git a/R/io-utility.R b/R/io-utility.R
@@ -1,6 +1,9 @@
 .remove.ext <- function(.str) {
-  # gsub(pattern = '.*/|[.].*$', replacement = '', x = .str)
-  gsub(pattern = ".*/|[.](txt|tsv|csv)$|([.](txt|tsv|csv))?[.](gz|bzip|bzip2|bz2)$", replacement = "", x = .str)
+  .str %<>% str_replace(".*/", "") %>%
+    str_replace(".*\\\\", "") %>%
+    str_replace("(\\.gz|\\.bzip|\\.bzip2|\\.bz2)$", "") %>%
+    str_replace("(\\.txt|\\.tsv|\\.csv)$", "")
+  return(.str)
 }
 
 

diff --git a/R/io.R b/R/io.R
@@ -20,7 +20,7 @@ if (getRversion() >= "2.15.1") {
 #' @importFrom jsonlite read_json
 #' @importFrom stringr str_split str_detect str_replace_all str_trim
 #' @importFrom methods as
-#' @importFrom dplyr contains first select_ group_by_at one_of
+#' @importFrom dplyr contains first select_ group_by_at one_of row_number
 #' @importFrom utils read.table
 #' @importFrom data.table setDF
 #'
@@ -291,13 +291,13 @@ repLoad <- function(.path, .mode = "paired", .coding = TRUE, ...) {
     missed_in_metadata <- setdiff(.metadata$Sample, .rep_names)
     if (length(missed_in_folders) || length(missed_in_metadata)) {
       if (length(missed_in_metadata)) {
-        message("  -- Samples found in the metadata, but not in the folder:\n     ", missed_in_metadata)
+        message("  -- Samples found in the metadata, but not in the folder:\n     ", toString(missed_in_metadata))
         message("  Did you correctly specify all the sample names in the metadata file?")
 
         error_flag <- TRUE
       }
       if (length(missed_in_folders)) {
-        message("  -- Samples found in the folder, but not in the metadata:\n     ", missed_in_folders)
+        message("  -- Samples found in the folder, but not in the metadata:\n     ", toString(missed_in_folders))
         message("  Did you add all the necessary samples to the metadata file with correct names?")
         message("  Creating dummy sample records in the metadata for now...")
 

diff --git a/R/overlap.R b/R/overlap.R
@@ -196,12 +196,14 @@ overlap_coef <- function(.x, .y) {
   UseMethod("overlap_coef")
 }
 
+#' @export
 overlap_coef.default <- function(.x, .y) {
   .x <- collect(.x, n = Inf)
   .y <- collect(.y, n = Inf)
   nrow(dplyr::intersect(.x, .y)) / min(nrow(.x), nrow(.y))
 }
 
+#' @export
 overlap_coef.character <- function(.x, .y) {
   length(dplyr::intersect(.x, .y)) / min(length(.x), length(.y))
 }
@@ -211,13 +213,15 @@ jaccard_index <- function(.x, .y) {
   UseMethod("jaccard_index")
 }
 
+#' @export
 jaccard_index.default <- function(.x, .y) {
   .x <- collect(.x, n = Inf)
   .y <- collect(.y, n = Inf)
   intersection <- nrow(dplyr::intersect(.x, .y))
   intersection / (nrow(.x) + nrow(.y) - intersection)
 }
 
+#' @export
 jaccard_index.character <- function(.x, .y) {
   intersection <- length(dplyr::intersect(.x, .y))
   intersection / (length(.x) + length(.y) - intersection)
@@ -227,13 +231,15 @@ tversky_index <- function(.x, .y, .a = .5, .b = .5) {
   UseMethod("tversky_index")
 }
 
+#' @export
 tversky_index.default <- function(.x, .y, .a = .5, .b = .5) {
   .x <- collect(.x, n = Inf)
   .y <- collect(.y, n = Inf)
   intersection <- nrow(dplyr::intersect(.x, .y))
   intersection / (.a * nrow(dplyr::setdiff(.x, .y)) + .b * nrow(dplyr::setdiff(.y, .x)) + intersection)
 }
 
+#' @export
 tversky_index.character <- function(.x, .y, .a = .5, .b = .5) {
   intersection <- length(dplyr::intersect(.x, .y))
   intersection / (.a * length(dplyr::setdiff(.x, .y)) + .b * length(dplyr::setdiff(.y, .x)) + intersection)
@@ -243,6 +249,7 @@ cosine_sim <- function(.x, .y, .quant) {
   UseMethod("cosine_sim")
 }
 
+#' @export
 cosine_sim.default <- function(.x, .y, .quant) {
   .x <- collect(.x, n = Inf)
   .y <- collect(.y, n = Inf)
@@ -258,6 +265,7 @@ cosine_sim.default <- function(.x, .y, .quant) {
   sum(first_col * second_col) / (sqrt(sum(first_col * first_col)) * sqrt(sum(second_col * second_col)))
 }
 
+#' @export
 cosine_sim.numeric <- function(.x, .y, .quant) {
   df <- rbind(.x, .y)
   sum(.x * .y) / (sqrt(rowSums(df^2))[1] * sqrt(rowSums(df^2))[2])[[1]]

diff --git a/R/sampling.R b/R/sampling.R
@@ -36,7 +36,7 @@
 #'
 #' Note: each connection must represent a separate repertoire.
 #'
-#' @param .method Character. Name of a sampling method. See "Description" for more details. Default value is "downsample"
+#' @param .method Character. Name of a sampling method. See "Details" for more details. Default value is "downsample"
 #' that downsamples the repertoires to the number of clones (i.e., reads / UMIs) that the smallest repertoire has, if user
 #' doesn't set any value to the ".n" argument.
 #'

diff --git a/R/seqCluster.R b/R/seqCluster.R
@@ -143,10 +143,11 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th
     if (!all(is.na(grouping_cols))) {
       result_multi %<>% map2_df(., pmap(group_values, data.frame)[!singleseq_flag], ~ cbind(.x, .y))
       res <- rbind(result_single, result_multi)
-      res[grouping_cols] <- str_split(str_split(res[["Cluster"]],
-        pattern = "_", simplify = TRUE
-      )[, 1],
-      pattern = "/", simplify = TRUE
+      res[grouping_cols] <- str_split(
+        str_split(res[["Cluster"]],
+          pattern = "_", simplify = TRUE
+        )[, 1],
+        pattern = "/", simplify = TRUE
       )[, seq_along(grouping_cols)]
     } else {
       result_multi %<>% map_df(., ~.x)

diff --git a/R/shiny.R b/R/shiny.R
@@ -343,7 +343,6 @@ fixVis <- function(.plot = NA) {
   #
   server <- function(input, output, session) {
     create_plot <- function(input) {
-
       # TODO: make automatic detection of available themes from ggplot2 and other packages
       choose_theme <- function(theme_label) {
         switch(theme_label,

diff --git a/R/tools.R b/R/tools.R
@@ -596,6 +596,17 @@ add_column_with_first_gene <- function(.data, .original_colname, .target_colname
   return(.data)
 }
 
+# add columns filled with NA
+add_empty_columns <- function(.data, .colnames) {
+  if (length(.colnames) > 0) {
+    new_columns <- rep(list(NA), length(.colnames))
+    names(new_columns) <- .colnames
+    return(do.call(cbind, c(list(.data), new_columns)))
+  } else {
+    return(.data)
+  }
+}
+
 # used to add sample name to error/warning messages when sample name is available
 optional_sample <- function(prefix, sample_name, suffix) {
   if (is.na(sample_name) || (sample_name == "")) {

diff --git a/R/vis.R b/R/vis.R
@@ -7,7 +7,7 @@ if (getRversion() >= "2.15.1") {
     "Overlap", "head", "Mean", "MeanVal", "MinVal", "MaxVal",
     "Q1", "Q2", "Type", "Length", "Gene", "Freq", "Sequence",
     "AA", "Clones", "Source.gr", "Target.gr", "Samples", "Samples.y",
-    "CDR3.aa", "p.adj", "group1", "group2", "y.coord", "..p.adj..", ".SD",
+    "CDR3.aa", "p.adj", "group1", "group2", "y.coord", ".SD",
     "name", "label", "."
   ))
 }
@@ -47,15 +47,11 @@ if (getRversion() >= "2.15.1") {
 
 
 .tweak_fill <- function(.n) {
-  palette_name <- ""
   if (.n == 1) {
     palette_name <- "Set2"
   } else if (.n == 2) {
     palette_name <- "Set1"
-  }
-  # else if (.n < 4) { palette_name = "YlGnBu" }
-  # else if (.n < 6) {  palette_name = "RdBu" }
-  else if (.n < 12) {
+  } else if (.n < 12) {
     palette_name <- "Spectral"
   } else {
     return(scale_fill_hue())
@@ -65,15 +61,11 @@ if (getRversion() >= "2.15.1") {
 }
 
 .tweak_col <- function(.n) {
-  palette_name <- ""
   if (.n == 1) {
     palette_name <- "Set2"
   } else if (.n == 2) {
     palette_name <- "Set1"
-  }
-  # else if (.n < 4) { palette_name = "YlGnBu" }
-  # else if (.n < 6) {  palette_name = "RdBu" }
-  else if (.n < 12) {
+  } else if (.n < 12) {
     palette_name <- "Spectral"
   } else {
     return(scale_colour_hue())
@@ -1469,7 +1461,7 @@ vis_box <- function(.data, .by = NA, .meta = NA, .melt = TRUE,
         # print(p_df)
 
         p <- p +
-          stat_compare_means(aes(label = ..p.adj..),
+          stat_compare_means(aes(label = after_stat(p.adj)),
             bracket.size = .5, size = .signif.label.size,
             label.y = max(.data$Value, na.rm = TRUE) * 1.07
           )
@@ -2188,7 +2180,7 @@ vis_bar <- function(.data, .by = NA, .meta = NA, .errorbars = c(0.025, 0.975), .
         # print(p_df)
 
         p <- p +
-          stat_compare_means(aes(label = ..p.adj..),
+          stat_compare_means(aes(label = after_stat(p.adj)),
             bracket.size = .5, size = .signif.label.size,
             label.y = max(.data$Value, na.rm = TRUE) * 1.07
           )
@@ -145,5 +145,3 @@ repExplore <- function(.data, .method = c("volume", "count", "len", "clones"), .
 
   res
 }
-
-rep.ex <- repExplore