From 6f7ffbb21688d967b03c9d2b98839e79cd6a2357 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Tue, 18 Oct 2022 17:33:49 +0200
Subject: [PATCH 01/18] version bump

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 71393fc4..1b264a25 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: immunarch
 Type: Package
 Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
-Version: 0.8.0
+Version: 0.9.0
 Authors@R: c(
     person("Vadim I.", "Nazarov", , "support@immunomind.io", c("aut", "cre")),
     person("Vasily O.", "Tsvetkov", , role = "aut"),

From 4526ff791912f0a49c22fecbc153bd8cbfad3a69 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Thu, 27 Oct 2022 18:40:57 +0200
Subject: [PATCH 02/18] Fix for phylip dnapars on Windows

---
 R/phylip.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/R/phylip.R b/R/phylip.R
index 7f63a0ac..6fa02a7b 100644
--- a/R/phylip.R
+++ b/R/phylip.R
@@ -60,7 +60,7 @@
 #'   repClonalFamily(.threads = 1, .nofail = TRUE)
 #' @export repClonalFamily
 repClonalFamily <- function(.data, .threads = parallel::detectCores(), .nofail = FALSE) {
-  if (!require_system_package("phylip", error_message = paste0(
+  if (!require_system_package(c("phylip", "dnapars"), error_message = paste0(
     "repLineagePhylogeny requires PHYLIP app to be installed!\n",
     "Please install it as described here:\n",
     "https://evolution.genetics.washington.edu/phylip/install.html"
@@ -127,8 +127,9 @@ process_cluster <- function(cluster_row) {
   # these \t are also used to read outfile as table
   rownames(alignment) %<>% paste0("\t")
   phangorn::write.phyDat(alignment, file.path(temp_dir, "infile"))
+  dnapars <- if (Sys.which("phylip") == "") "dnapars" else "phylip dnapars"
   system(
-    paste0("sh -c \"cd ", temp_dir, "; phylip dnapars infile\""),
+    paste0("sh -c \"cd ", temp_dir, "; ", dnapars, " infile\""),
     input = (c("V", 1, 5, ".", "Y"))
   ) %>%
     quiet(capture_output = TRUE)

From 9c0baf2a3cc193ae351c938c5cd48273b7c40c65 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Thu, 27 Oct 2022 19:57:41 +0200
Subject: [PATCH 03/18] fix for path separators on Windows

---
 R/phylip.R | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/R/phylip.R b/R/phylip.R
index 6fa02a7b..c268ebff 100644
--- a/R/phylip.R
+++ b/R/phylip.R
@@ -121,12 +121,13 @@ process_cluster <- function(cluster_row) {
   sequences <- cluster_row[["Sequences"]][[1]]
   cluster_name <- cluster_row[["Cluster"]]
 
-  temp_dir <- file.path(tempdir(check = TRUE), uuid::UUIDgenerate(use.time = FALSE))
+  fsep <- if (.Platform$OS.type == "windows") "\\" else "/"
+  temp_dir <- file.path(tempdir(check = TRUE), uuid::UUIDgenerate(use.time = FALSE), fsep = fsep)
   dir.create(temp_dir)
   # workaround for phylip: it shows "Unexpected end-of-file" for too short sequence labels;
   # these \t are also used to read outfile as table
   rownames(alignment) %<>% paste0("\t")
-  phangorn::write.phyDat(alignment, file.path(temp_dir, "infile"))
+  phangorn::write.phyDat(alignment, file.path(temp_dir, "infile", fsep = fsep))
   dnapars <- if (Sys.which("phylip") == "") "dnapars" else "phylip dnapars"
   system(
     paste0("sh -c \"cd ", temp_dir, "; ", dnapars, " infile\""),
@@ -134,9 +135,9 @@ process_cluster <- function(cluster_row) {
   ) %>%
     quiet(capture_output = TRUE)
 
-  tree <- ape::read.tree(file.path(temp_dir, "outtree"))
+  tree <- ape::read.tree(file.path(temp_dir, "outtree", fsep = fsep))
 
-  outfile_path <- file.path(temp_dir, "outfile")
+  outfile_path <- file.path(temp_dir, "outfile", fsep = fsep)
   outfile_table <- read.table(outfile_path,
     sep = "\t", header = FALSE, na.strings = "", stringsAsFactors = FALSE,
     fill = TRUE, blank.lines.skip = TRUE

From 40536a130db1ce7ec1fa789025a8565ebf242871 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Thu, 27 Oct 2022 20:05:19 +0200
Subject: [PATCH 04/18] shell fix for Windows

---
 R/phylip.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/phylip.R b/R/phylip.R
index c268ebff..f88b7b23 100644
--- a/R/phylip.R
+++ b/R/phylip.R
@@ -122,6 +122,7 @@ process_cluster <- function(cluster_row) {
   cluster_name <- cluster_row[["Cluster"]]
 
   fsep <- if (.Platform$OS.type == "windows") "\\" else "/"
+  shell <- if (.Platform$OS.type == "windows") "powershell /c " else "sh -c "
   temp_dir <- file.path(tempdir(check = TRUE), uuid::UUIDgenerate(use.time = FALSE), fsep = fsep)
   dir.create(temp_dir)
   # workaround for phylip: it shows "Unexpected end-of-file" for too short sequence labels;
@@ -130,7 +131,7 @@ process_cluster <- function(cluster_row) {
   phangorn::write.phyDat(alignment, file.path(temp_dir, "infile", fsep = fsep))
   dnapars <- if (Sys.which("phylip") == "") "dnapars" else "phylip dnapars"
   system(
-    paste0("sh -c \"cd ", temp_dir, "; ", dnapars, " infile\""),
+    paste0(shell, "\"cd ", temp_dir, "; ", dnapars, " infile\""),
     input = (c("V", 1, 5, ".", "Y"))
   ) %>%
     quiet(capture_output = TRUE)

From 5c75838c92e7445791098f4f55a0fcc063ebe9cb Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Thu, 3 Nov 2022 16:01:32 +0100
Subject: [PATCH 05/18] Added support for groups coloring on repClonalFamily()
 visualization

---
 R/phylip.R             | 48 +++++++++++++++++++++++++++++++++++++-----
 man/repClonalFamily.Rd | 10 ++++++++-
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/R/phylip.R b/R/phylip.R
index f88b7b23..b96bba88 100644
--- a/R/phylip.R
+++ b/R/phylip.R
@@ -21,10 +21,18 @@
 #'
 #' @usage
 #'
-#' repClonalFamily(.data, .threads, .nofail)
+#' repClonalFamily(.data, .vis_groups, .threads, .nofail)
 #'
 #' @param .data The data to be processed, output of repAlignLineage() function.
 #'
+#' @param .vis_groups Groups for visualization, used to annotate specific clones on chart
+#' and display them in different colors. This is a named list, where names are for the chart legend,
+#' and list items are clone IDs that belong to the groups. It's not necessary to assign groups to
+#' all clonotypes; unassigned ones will be displayed on the chart as "Clonotype" category.
+#' It's also possible to assign multiple clonotypes to the same group by providing nested lists
+#' or vectors of clone IDs instead of single clone IDs.
+#' Example: .vis_groups = list(A = 817, B = 201, C = list(303, 42))
+#'
 #' @param .threads Number of threads to use.
 #'
 #' @param .nofail Returns NA instead of stopping if PHYLIP is not installed.
@@ -59,7 +67,10 @@
 #'   repAlignLineage(.min_lineage_sequences = 2, .align_threads = 2, .nofail = TRUE) %>%
 #'   repClonalFamily(.threads = 1, .nofail = TRUE)
 #' @export repClonalFamily
-repClonalFamily <- function(.data, .threads = parallel::detectCores(), .nofail = FALSE) {
+repClonalFamily <- function(.data,
+                            .vis_groups = NA,
+                            .threads = parallel::detectCores(),
+                            .nofail = FALSE) {
   if (!require_system_package(c("phylip", "dnapars"), error_message = paste0(
     "repLineagePhylogeny requires PHYLIP app to be installed!\n",
     "Please install it as described here:\n",
@@ -68,18 +79,34 @@ repClonalFamily <- function(.data, .threads = parallel::detectCores(), .nofail =
     return(get_empty_object_with_class("step_failure_ignored"))
   }
 
+  if (has_no_data(.vis_groups)) {
+    vis_groups <- NA
+  } else {
+    # switch keys and values of vis_groups and extract nested lists of clone IDs
+    vis_groups <- list()
+    for (i in seq_along(.vis_groups)) {
+      group_name <- names(.vis_groups)[i]
+      for (clone_id in unlist(.vis_groups[[i]])) {
+        vis_groups_element <- group_name
+        names(vis_groups_element) <- clone_id
+        vis_groups %<>% append(vis_groups_element)
+      }
+    }
+  }
+
   results <- .data %>%
     apply_to_sample_or_list(
       process_dataframe,
       .with_names = TRUE,
       .validate = FALSE,
+      vis_groups = vis_groups,
       .threads = .threads
     ) %>%
     add_class("clonal_family")
   return(results)
 }
 
-process_dataframe <- function(df, .threads, sample_name = NA) {
+process_dataframe <- function(df, vis_groups, .threads, sample_name = NA) {
   required_columns <- c("Cluster", "Germline", "Alignment", "Sequences")
   for (column in required_columns) {
     if (!(column %in% colnames(df))) {
@@ -108,13 +135,14 @@ process_dataframe <- function(df, .threads, sample_name = NA) {
     clusters_list,
     process_cluster,
     mc.preschedule = FALSE,
-    mc.cores = .threads
+    mc.cores = .threads,
+    vis_groups = vis_groups
   ) %>%
     convert_nested_to_df()
   return(results)
 }
 
-process_cluster <- function(cluster_row) {
+process_cluster <- function(cluster_row, vis_groups) {
   # alignment, sequences and aa_frame_starts should be extracted from 1-element lists
   # because of these columns format
   alignment <- cluster_row[["Alignment"]][[1]]
@@ -293,6 +321,16 @@ process_cluster <- function(cluster_row) {
       }
       tree_stats[row, "DistanceAA"] <- sum(seq_aa_chars != ancestor_aa_chars)
     }
+
+    # assign visualization group for current sequence if specified
+    if (!has_no_data(vis_groups)) {
+      if (tree_stats[row, "Type"] == "Clonotype") {
+        seq_id <- substring(tree_stats[row, "Name"], 4)
+        if (seq_id %in% names(vis_groups)) {
+          tree_stats[row, "Type"] <- vis_groups[[seq_id]]
+        }
+      }
+    }
   }
 
   for (column in c("Clones", "DistanceNT", "DistanceAA")) {
diff --git a/man/repClonalFamily.Rd b/man/repClonalFamily.Rd
index 5bf5099e..1f1adb78 100644
--- a/man/repClonalFamily.Rd
+++ b/man/repClonalFamily.Rd
@@ -4,11 +4,19 @@
 \alias{repClonalFamily}
 \title{Builds a phylogenetic tree using the sequences of a clonal lineage}
 \usage{
-repClonalFamily(.data, .threads, .nofail)
+repClonalFamily(.data, .vis_groups, .threads, .nofail)
 }
 \arguments{
 \item{.data}{The data to be processed, output of repAlignLineage() function.}
 
+\item{.vis_groups}{Groups for visualization, used to annotate specific clones on chart
+and display them in different colors. This is a named list, where names are for the chart legend,
+and list items are clone IDs that belong to the groups. It's not necessary to assign groups to
+all clonotypes; unassigned ones will be displayed on the chart as "Clonotype" category.
+It's also possible to assign multiple clonotypes to the same group by providing nested lists
+or vectors of clone IDs instead of single clone IDs.
+Example: .vis_groups = list(A = 817, B = 201, C = list(303, 42))}
+
 \item{.threads}{Number of threads to use.}
 
 \item{.nofail}{Returns NA instead of stopping if PHYLIP is not installed.

From e836edf19bde633453c5be8f4df6f9e875705daf Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Wed, 16 Nov 2022 17:38:44 +0100
Subject: [PATCH 06/18] repDiversity bugfixes

---
 R/diversity.R | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/R/diversity.R b/R/diversity.R
index 9fe80913..8c965cee 100644
--- a/R/diversity.R
+++ b/R/diversity.R
@@ -188,7 +188,6 @@ repDiversity <- function(.data, .method = "chao1", .col = "aa", .max.q = 6, .min
       .data <- list(Sample = .data)
     }
 
-
     .col <- process_col_argument(.col) # ToDo: refactor this and the next branches
 
     vec <- lapply(.data, function(x) {
@@ -196,7 +195,7 @@ repDiversity <- function(.data, .method = "chao1", .col = "aa", .max.q = 6, .min
         x <- x %>% lazy_dt()
       }
       x %>%
-        select(.col, IMMCOL$count) %>%
+        select(all_of(.col), IMMCOL$count) %>%
         group_by_at(vars(.col)) %>%
         summarise(Div.count = sum(!!sym(IMMCOL$count))) %>%
         pull(Div.count)
@@ -350,20 +349,20 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
   if (is.na(.step)) {
     .step <- min(sapply(.data, function(x) sum(as.numeric(x)))) %/% 50.
   }
+  .step <- max(1, .step)
 
   if (is.na(.extrapolation)) {
     .extrapolation <- max(sapply(.data, function(x) sum(as.numeric(x)))) * 20
   }
 
   .alpha <- function(n, Xi, m) {
-    k <- Xi
     return((1 - m / n)^Xi)
   }
 
   if (.verbose) {
     pb <- set_pb(sum(sapply(seq_along(.data), function(i) {
       bc.vec <- .data[[i]]
-      bc.sum <- sum(.data[[i]])
+      bc.sum <- sum(bc.vec)
       sizes <- seq(.step, bc.sum, .step)
       if (sizes[length(sizes)] != bc.sum) {
         sizes <- c(sizes, bc.sum)
@@ -381,9 +380,7 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
     }
     n <- sum(bc.vec)
     sizes <- seq(.step, n, .step)
-    # if (sizes[length(sizes)] != n) {
-    #   sizes <- c(sizes, n)
-    # }
+
     counts <- table(bc.vec)
     muc.res <- t(sapply(sizes, function(sz) {
       freqs <- as.numeric(names(counts))
@@ -413,7 +410,6 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
     }))
 
     if (.extrapolation > 0) {
-      # sizes <- seq(sum(.data[[i]]), .extrapolation + max(sapply(.data, function (x) sum(x))), .step)
       sizes <- seq(
         tail(seq(.step, sum(.data[[i]]), .step), 1) + .step,
         .extrapolation,

From b616d6de08ae877c9f4dc571868f10e5e683211f Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Fri, 18 Nov 2022 13:38:20 +0100
Subject: [PATCH 07/18] Support for MiXCR 4.1 format and manual count column
 selection

---
 R/io-parsers.R | 11 ++++++++---
 R/io.R         | 10 ++++++----
 man/repLoad.Rd |  4 +++-
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/R/io-parsers.R b/R/io-parsers.R
index 2d574741..6fc99c1f 100644
--- a/R/io-parsers.R
+++ b/R/io-parsers.R
@@ -399,10 +399,10 @@ parse_mitcr <- function(.filename, .mode) {
   )
 }
 
-parse_mixcr <- function(.filename, .mode) {
+parse_mixcr <- function(.filename, .mode, .count = c("clonecount", "readcount")) {
   .filename <- .filename
   .id <- "cloneid"
-  .count <- "clonecount"
+  .count %<>% tolower()
   .sep <- "\t"
   .vd.insertions <- "VD.insertions"
   .dj.insertions <- "DJ.insertions"
@@ -677,7 +677,7 @@ parse_mixcr <- function(.filename, .mode) {
     df[[pos_extra_headers[["j3del"]]]] <- sapply(df[["refpoints"]], get_ref_point_position, 18)
   }
 
-  if (!(.count %in% table.colnames)) {
+  if (!any(.count %in% table.colnames)) {
     warn_msg <- c("  [!] Warning: can't find a column with clonal counts. Setting all clonal counts to 1.")
     warn_msg <- c(warn_msg, "\n      Did you apply repLoad to MiXCR file *_alignments.txt?")
     warn_msg <- c(warn_msg, " If so please consider moving all *.clonotypes.*.txt MiXCR files to")
@@ -685,8 +685,13 @@ parse_mixcr <- function(.filename, .mode) {
     warn_msg <- c(warn_msg, "\n      Note: The *_alignments.txt file IS NOT a repertoire file suitable for any analysis.")
     message(warn_msg)
 
+    .count <- .count[1]
     df[[.count]] <- 1
+  } else if (length(.count) > 1) {
+    # if multiple column name options specified for .count, keep only the first valid
+    .count <- .count[.count %in% table.colnames][1]
   }
+
   .freq <- "Proportion"
   df$Proportion <- df[[.count]] / sum(df[[.count]], na.rm = TRUE)
 
diff --git a/R/io.R b/R/io.R
index aa1507aa..0b607537 100644
--- a/R/io.R
+++ b/R/io.R
@@ -59,6 +59,8 @@ if (getRversion() >= "2.15.1") {
 #'
 #' @param .coding A logical value. Set TRUE to get coding-only clonotypes (by defaul). Set FALSE to get all clonotypes.
 #'
+#' @param ... Extra arguments for parsing functions
+#'
 #' @details
 #' The metadata has to be a tab delimited file with first column named "Sample".
 #' It can have any number of additional columns with arbitrary names.
@@ -135,14 +137,14 @@ if (getRversion() >= "2.15.1") {
 #' # > names(immdata)
 #' # [1] "data" "meta"
 #' @export repLoad
-repLoad <- function(.path, .mode = "paired", .coding = TRUE) {
+repLoad <- function(.path, .mode = "paired", .coding = TRUE, ...) {
   exclude_extensions <- c(
     "so", "exe", "bam", "fasta", "fai", "fastq", "bed", "rds", "report", "vdjca"
   )
 
   # Process a repertoire file: detect format and load the data
   # Return: a named list with a repertoire data frame and it's name
-  .read_repertoire <- function(.path, .mode, .coding) {
+  .read_repertoire <- function(.path, .mode, .coding, ...) {
     parse_res <- list()
     cur_format <- .detect_format(.path)
 
@@ -176,7 +178,7 @@ repLoad <- function(.path, .mode = "paired", .coding = TRUE) {
       if (suppressWarnings(is.na(parse_fun))) {
         message("unknown format, skipping")
       } else {
-        parse_res <- parse_fun(.path, .mode)
+        parse_res <- parse_fun(.path, .mode, ...)
 
         if (is.null(parse_res)) {
           message("  [!] Warning: zero clonotypes found, skipping")
@@ -233,7 +235,7 @@ repLoad <- function(.path, .mode = "paired", .coding = TRUE) {
         } else if (stringr::str_detect(.filepath, "barcode")) {
           # TODO: add the barcode processing subroutine to split by samples
         } else {
-          repertoire <- .read_repertoire(.filepath, .mode, .coding)
+          repertoire <- .read_repertoire(.filepath, .mode, .coding, ...)
           if (length(repertoire) != 0) {
             parsed_batch <- c(parsed_batch, repertoire)
           }
diff --git a/man/repLoad.Rd b/man/repLoad.Rd
index cb302865..6d9eeaf8 100644
--- a/man/repLoad.Rd
+++ b/man/repLoad.Rd
@@ -4,7 +4,7 @@
 \alias{repLoad}
 \title{Load immune repertoire files into the R workspace}
 \usage{
-repLoad(.path, .mode = "paired", .coding = TRUE)
+repLoad(.path, .mode = "paired", .coding = TRUE, ...)
 }
 \arguments{
 \item{.path}{A character string specifying the path to the input data.
@@ -33,6 +33,8 @@ Currently "single" works for every format, and "paired" works only for 10X Genom
 By default, 10X Genomics data will be loaded as paired chain data, and other files will be loaded as single chain data.}
 
 \item{.coding}{A logical value. Set TRUE to get coding-only clonotypes (by defaul). Set FALSE to get all clonotypes.}
+
+\item{...}{Extra arguments for parsing functions}
 }
 \value{
 A list with two named elements:

From a475912ca7b2f17e15f41d0f7ef326bc1a4cbe19 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Mon, 21 Nov 2022 23:27:02 +0100
Subject: [PATCH 08/18] Tests for repAlignLineage; small fixes

---
 NAMESPACE                           |   1 -
 R/align_lineage.R                   |   3 +-
 tests/testthat/test-align-lineage.R | 100 ++++++++++++++++++++++++++++
 tests/testthat/test-germline.R      |   2 +-
 4 files changed, 102 insertions(+), 4 deletions(-)
 create mode 100644 tests/testthat/test-align-lineage.R

diff --git a/NAMESPACE b/NAMESPACE
index 1db20f3b..d6c0a501 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -296,7 +296,6 @@ importFrom(utils,capture.output)
 importFrom(utils,packageVersion)
 importFrom(utils,read.table)
 importFrom(utils,setTxtProgressBar)
-importFrom(utils,str)
 importFrom(utils,tail)
 importFrom(utils,txtProgressBar)
 importFrom(uuid,UUIDgenerate)
diff --git a/R/align_lineage.R b/R/align_lineage.R
index 3d89590c..a38ecdee 100644
--- a/R/align_lineage.R
+++ b/R/align_lineage.R
@@ -9,7 +9,6 @@
 #' @importFrom plyr dlply .
 #' @importFrom purrr map_dfr
 #' @importFrom rlist list.remove
-#' @importFrom utils str
 #' @importFrom ape as.DNAbin clustal
 #' @importFrom doParallel registerDoParallel stopImplicitCluster
 #' @importFrom parallel mclapply
@@ -117,7 +116,7 @@ align_single_df <- function(data,
         "Found dataframe without required column ",
         required_column,
         ";\nexisting columns: ",
-        utils::str(colnames(data))
+        toString(colnames(data))
       )
     }
   }
diff --git a/tests/testthat/test-align-lineage.R b/tests/testthat/test-align-lineage.R
new file mode 100644
index 00000000..3b111f53
--- /dev/null
+++ b/tests/testthat/test-align-lineage.R
@@ -0,0 +1,100 @@
+data(bcrdata)
+test_bcr_data <- bcrdata$data %>% top(1000)
+test_input <- test_bcr_data %>%
+  seqCluster(seqDist(test_bcr_data), .fixed_threshold = 3) %>%
+  repGermline(.threads = 1) %>%
+  suppressWarnings()
+
+positive_test_cases <- list(
+  "Not empty result" = list(
+    args = list(
+      .data = test_input,
+      .min_lineage_sequences = 2,
+      .prepare_threads = 1,
+      .align_threads = 1
+    ),
+    assert_function = function(result) {
+      expect_equal(immunarch:::has_no_data(result), FALSE)
+      expect_equal(nrow(result[["full_clones"]]) > 0, TRUE)
+    }
+  ),
+  "Multiple threads" = list(
+    args = list(
+      .data = test_input,
+      .min_lineage_sequences = 2
+    ),
+    assert_function = function(result) {
+      expect_equal(immunarch:::has_no_data(result), FALSE)
+      expect_equal(nrow(result[["full_clones"]]) > 0, TRUE)
+    }
+  ),
+  "Dataframe only" = list(
+    args = list(
+      .data = test_input[["full_clones"]],
+      .min_lineage_sequences = 2,
+      .prepare_threads = 1,
+      .align_threads = 1
+    ),
+    assert_function = function(result) {
+      expect_equal(immunarch:::has_no_data(result), FALSE)
+      expect_equal(nrow(result) > 0, TRUE)
+    }
+  )
+)
+
+for (i in seq_along(positive_test_cases)) {
+  # Arrange
+  test_name <- names(positive_test_cases)[i]
+  args <- positive_test_cases[[i]][["args"]]
+  assert_function <- positive_test_cases[[i]][["assert_function"]]
+
+  # Act
+  result <- suppressWarnings(do.call(repAlignLineage, args))
+
+  # Assert
+  test_that(
+    test_name,
+    assert_function(result)
+  )
+}
+
+negative_test_cases <- list(
+  "List of lists" = list(
+    args = list(
+      .data = bcrdata
+    )
+  ),
+  "Missing columns" = list(
+    args = list(
+      .data = test_bcr_data[["full_clones"]],
+      .prepare_threads = 1,
+      .align_threads = 1
+    )
+  ),
+  "Missing Cluster column" = list(
+    args = list(
+      .data = subset(test_input[["full_clones"]], select = -c(get("Cluster"))),
+      .prepare_threads = 1,
+      .align_threads = 1
+    )
+  ),
+  "Missing Germline.sequence column" = list(
+    args = list(
+      .data = subset(test_input[["full_clones"]], select = -c(get("Germline.sequence"))),
+      .prepare_threads = 1,
+      .align_threads = 1
+    )
+  )
+)
+
+for (i in seq_along(negative_test_cases)) {
+  # Arrange
+  test_name <- names(negative_test_cases)[i]
+  args <- negative_test_cases[[i]][["args"]]
+
+  # Act, Assert
+  test_that(
+    test_name,
+    expect_error(suppressWarnings(do.call(repAlignLineage, args)))
+  )
+}
diff --git a/tests/testthat/test-germline.R b/tests/testthat/test-germline.R
index aae7fe0d..124ddd62 100644
--- a/tests/testthat/test-germline.R
+++ b/tests/testthat/test-germline.R
@@ -57,7 +57,7 @@ negative_test_cases <- list(
   ),
   "Missing column" = list(
     args = list(
-      .data = subset(test_bcr_data[["full_clones"]], select = -c(FR1.nt)),
+      .data = subset(test_bcr_data[["full_clones"]], select = -c(get("FR1.nt"))),
       .threads = 1
     )
   )

From 0a45d088360f525ec9f84f52b01c3cd7e6b85c73 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Fri, 25 Nov 2022 13:41:48 +0100
Subject: [PATCH 09/18] 10x_filt_contigs parser: fix for groups with same
 sequences in different order

---
 R/io-parsers.R | 19 +++++++++++++++----
 R/tools.R      |  4 ++++
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/R/io-parsers.R b/R/io-parsers.R
index 6fc99c1f..e0c446ec 100644
--- a/R/io-parsers.R
+++ b/R/io-parsers.R
@@ -1053,7 +1053,7 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
   setDT(df)
 
   if (.mode == "paired") {
-    df <- df %>%
+    df %<>%
       lazy_dt() %>%
       group_by(barcode, raw_clonotype_id) %>%
       summarise(
@@ -1072,13 +1072,21 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
       as.data.table()
   }
 
-  df <- df %>%
+  df %<>%
     lazy_dt() %>%
-    group_by(CDR3.nt, V.name, J.name) %>%
+    mutate(
+      CDR3.nt.sorted = sort_string(get("CDR3.nt"), IMMCOL_ADD$scsep),
+      V.name.sorted = sort_string(get("V.name"), IMMCOL_ADD$scsep),
+      J.name.sorted = sort_string(get("J.name"), IMMCOL_ADD$scsep)
+    ) %>%
+    group_by(CDR3.nt.sorted, V.name.sorted, J.name.sorted) %>%
     summarise(
       Clones = length(unique(get("barcode"))),
+      CDR3.nt = first(get("CDR3.nt")),
       CDR3.aa = first(get("CDR3.aa")),
+      V.name = first(get("V.name")),
       D.name = first(get("D.name")),
+      J.name = first(get("J.name")),
       chain = first(get("chain")),
       barcode = paste0(unique(get("barcode")), collapse = IMMCOL_ADD$scsep),
       raw_clonotype_id = gsub(
@@ -1088,7 +1096,10 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
       contig_id = paste0(get("contig_id"), collapse = IMMCOL_ADD$scsep),
       c_gene = first(get("c_gene"))
     ) %>%
-    as.data.table()
+    as.data.table() %>%
+    subset(
+      select = -c(get("CDR3.nt.sorted"), get("V.name.sorted"), get("J.name.sorted"))
+    )
 
   df$V.end <- NA
   df$J.start <- NA
diff --git a/R/tools.R b/R/tools.R
index 5bf9e8d3..b7fc8976 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -486,6 +486,10 @@ as_numeric_or_fail <- function(.string) {
   return(result)
 }
 
+sort_string <- function(.string, .delim) {
+  map_chr(strsplit(.string, .delim), ~ paste(sort(.x), collapse = .delim))
+}
+
 has_no_data <- function(.data) {
   any(sapply(list(NA, NULL, NaN), identical, .data)) | all(is.na(.data))
 }

From b2aa6b6febd55461b6c57ba0b68739f96033c33e Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Tue, 29 Nov 2022 22:50:53 +0100
Subject: [PATCH 10/18] Tests for repClonalFamily()

---
 tests/testthat/test-clonal-family.R | 115 ++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 tests/testthat/test-clonal-family.R

diff --git a/tests/testthat/test-clonal-family.R b/tests/testthat/test-clonal-family.R
new file mode 100644
index 00000000..3903a4b0
--- /dev/null
+++ b/tests/testthat/test-clonal-family.R
@@ -0,0 +1,115 @@
+data(bcrdata)
+test_bcr_data <- bcrdata$data %>% top(1000)
+test_input <- test_bcr_data %>%
+  seqCluster(seqDist(test_bcr_data), .fixed_threshold = 3) %>%
+  repGermline(.threads = 1) %>%
+  repAlignLineage(.min_lineage_sequences = 2, .prepare_threads = 1, .align_threads = 1) %>%
+  suppressWarnings()
+
+positive_test_cases <- list(
+  "Not empty result" = list(
+    args = list(
+      .data = test_input,
+      .threads = 1
+    ),
+    assert_function = function(result) {
+      expect_equal(immunarch:::has_no_data(result), FALSE)
+      expect_equal(nrow(result[["full_clones"]]) > 0, TRUE)
+    }
+  ),
+  "Multiple threads" = list(
+    args = list(
+      .data = test_input,
+      .threads = 8
+    ),
+    assert_function = function(result) {
+      expect_equal(immunarch:::has_no_data(result), FALSE)
+      expect_equal(nrow(result[["full_clones"]]) > 0, TRUE)
+    }
+  ),
+  "Dataframe only" = list(
+    args = list(
+      .data = test_input[["full_clones"]],
+      .threads = 1
+    ),
+    assert_function = function(result) {
+      expect_equal(immunarch:::has_no_data(result), FALSE)
+      expect_equal(nrow(result) > 0, TRUE)
+    }
+  ),
+  "Vis groups" = list(
+    args = list(
+      .data = test_input,
+      .vis_groups = {
+        clone_ids <- test_input[["full_clones"]] %>%
+          unnest("Sequences") %>%
+          extract2("Clone.ID")
+        list(
+          Group1 = clone_ids[1],
+          Group2 = clone_ids[3],
+          Group3 = list(clone_ids[5], clone_ids[2]),
+          Group4 = c(clone_ids[7], clone_ids[4])
+        )
+      },
+      .threads = 1
+    ),
+    assert_function = function(result) {
+      types <- result[["full_clones"]] %>%
+        unnest("TreeStats") %>%
+        extract2("Type")
+      # check that correct number of clonotypes is assigned to each group
+      expect_equal(tabulate(match(types, "Group1")), 1)
+      expect_equal(tabulate(match(types, "Group2")), 1)
+      expect_equal(tabulate(match(types, "Group3")), 2)
+      expect_equal(tabulate(match(types, "Group4")), 2)
+    }
+  )
+)
+
+for (i in seq_along(positive_test_cases)) {
+  # Arrange
+  test_name <- names(positive_test_cases)[i]
+  args <- positive_test_cases[[i]][["args"]]
+  assert_function <- positive_test_cases[[i]][["assert_function"]]
+
+  # Act
+  result <- do.call(repClonalFamily, args)
+
+  # Assert
+  test_that(
+    test_name,
+    assert_function(result)
+  )
+}
+
+negative_test_cases <- list(
+  "List of lists" = list(
+    args = list(
+      .data = bcrdata
+    )
+  ),
+  "Missing columns" = list(
+    args = list(
+      .data = test_bcr_data[["full_clones"]],
+      .threads = 1
+    )
+  ),
+  "Missing Alignment column" = list(
+    args = list(
+      .data = subset(test_input[["full_clones"]], select = -c(get("Alignment"))),
+      .threads = 1
+    )
+  )
+)
+
+for (i in seq_along(negative_test_cases)) {
+  # Arrange
+  test_name <- names(negative_test_cases)[i]
+  args <- negative_test_cases[[i]][["args"]]
+
+  # Act, Assert
+  test_that(
+    test_name,
+    expect_error(do.call(repAlignLineage, args))
+  )
+}

From 55cfacc5a6517a652563c9fb00d100c60529d1c0 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Wed, 30 Nov 2022 16:27:26 +0100
Subject: [PATCH 11/18] fix

---
 tests/testthat/test-clonal-family.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test-clonal-family.R b/tests/testthat/test-clonal-family.R
index 3903a4b0..8c09a65e 100644
--- a/tests/testthat/test-clonal-family.R
+++ b/tests/testthat/test-clonal-family.R
@@ -110,6 +110,6 @@ for (i in seq_along(negative_test_cases)) {
   # Act, Assert
   test_that(
     test_name,
-    expect_error(do.call(repAlignLineage, args))
+    expect_error(do.call(repClonalFamily, args))
   )
 }

From 91446cfcc082aa140c1ac590c4282a67773d8e65 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Wed, 30 Nov 2022 16:28:35 +0100
Subject: [PATCH 12/18] SHM tests

---
 tests/testthat/test-somatic-hypermutation.R | 95 +++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 tests/testthat/test-somatic-hypermutation.R

diff --git a/tests/testthat/test-somatic-hypermutation.R b/tests/testthat/test-somatic-hypermutation.R
new file mode 100644
index 00000000..74aac8f5
--- /dev/null
+++ b/tests/testthat/test-somatic-hypermutation.R
@@ -0,0 +1,95 @@
+data(bcrdata)
+test_bcr_data <- bcrdata$data %>% top(1000)
+test_input <- test_bcr_data %>%
+  seqCluster(seqDist(test_bcr_data), .fixed_threshold = 3) %>%
+  repGermline(.threads = 1) %>%
+  repAlignLineage(.min_lineage_sequences = 2, .prepare_threads = 1, .align_threads = 1) %>%
+  repClonalFamily(.threads = 1) %>%
+  suppressWarnings()
+
+positive_test_cases <- list(
+  "Not empty result" = list(
+    args = list(
+      .data = test_input,
+      .threads = 1
+    ),
+    assert_function = function(result) {
+      expect_equal(immunarch:::has_no_data(result), FALSE)
+      expect_equal(nrow(result[["full_clones"]]) > 0, TRUE)
+    }
+  ),
+  "Multiple threads" = list(
+    args = list(
+      .data = test_input,
+      .threads = 8
+    ),
+    assert_function = function(result) {
+      expect_equal(immunarch:::has_no_data(result), FALSE)
+      expect_equal(nrow(result[["full_clones"]]) > 0, TRUE)
+    }
+  ),
+  "Dataframe only" = list(
+    args = list(
+      .data = test_input[["full_clones"]],
+      .threads = 1
+    ),
+    assert_function = function(result) {
+      expect_equal(immunarch:::has_no_data(result), FALSE)
+      expect_equal(nrow(result) > 0, TRUE)
+    }
+  )
+)
+
+for (i in seq_along(positive_test_cases)) {
+  # Arrange
+  test_name <- names(positive_test_cases)[i]
+  args <- positive_test_cases[[i]][["args"]]
+  assert_function <- positive_test_cases[[i]][["assert_function"]]
+
+  # Act
+  result <- do.call(repSomaticHypermutation, args)
+
+  # Assert
+  test_that(
+    test_name,
+    assert_function(result)
+  )
+}
+
+negative_test_cases <- list(
+  "List of lists" = list(
+    args = list(
+      .data = bcrdata
+    )
+  ),
+  "Missing columns" = list(
+    args = list(
+      .data = test_bcr_data[["full_clones"]],
+      .threads = 1
+    )
+  ),
+  "Missing Sequences column" = list(
+    args = list(
+      .data = subset(test_input[["full_clones"]], select = -c(get("Sequences"))),
+      .threads = 1
+    )
+  ),
+  "Missing Germline.Input column" = list(
+    args = list(
+      .data = subset(test_input[["full_clones"]], select = -c(get("Germline.Input"))),
+      .threads = 1
+    )
+  )
+)
+
+for (i in seq_along(negative_test_cases)) {
+  # Arrange
+  test_name <- names(negative_test_cases)[i]
+  args <- negative_test_cases[[i]][["args"]]
+
+  # Act, Assert
+  test_that(
+    test_name,
+    expect_error(do.call(repSomaticHypermutation, args))
+  )
+}

From 2c74a621bfbfbe863bdcb3ca1a180a7bbf2332ab Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Tue, 6 Dec 2022 15:57:04 +0100
Subject: [PATCH 13/18] WIP

---
 R/io-parsers.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/io-parsers.R b/R/io-parsers.R
index 6fc99c1f..9747c282 100644
--- a/R/io-parsers.R
+++ b/R/io-parsers.R
@@ -1044,8 +1044,12 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
     .vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene",
     .vend = NA, .jstart = NA, .dstart = NA, .dend = NA,
     .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA,
-    .skip = 0, .sep = ",", # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id")
-    .add = c("chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene")
+    .skip = 0, .sep = ",",
+    .add = c(
+      "chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene",
+      "cdr1", "cdr1_aa", "cdr2", "cdr2_aa", "cdr3", "cdr3_aa",
+      "fwr1", "fwr1_aa", "fwr2", "fwr2_aa", "fwr3", "fwr3_aa", "fwr4", "fwr4_aa"
+    )
   )
 
   # Process 10xGenomics filtered contigs files - count barcodes, merge consensues ids, clonotype ids and contig ids

From 1c18fc436af6b1a5d7e0444e84ce26ea50dc5b4c Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Wed, 7 Dec 2022 13:37:17 +0100
Subject: [PATCH 14/18] Implemented CDR and FR parsing from 10x_filt_contigs

---
 R/io-parsers.R | 43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/R/io-parsers.R b/R/io-parsers.R
index 88bc0fc7..2dbcc167 100644
--- a/R/io-parsers.R
+++ b/R/io-parsers.R
@@ -1047,11 +1047,24 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
     .skip = 0, .sep = ",",
     .add = c(
       "chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene",
-      "cdr1", "cdr1_aa", "cdr2", "cdr2_aa", "cdr3", "cdr3_aa",
-      "fwr1", "fwr1_aa", "fwr2", "fwr2_aa", "fwr3", "fwr3_aa", "fwr4", "fwr4_aa"
+      "cdr1_nt", "cdr1", "cdr2_nt", "cdr2",
+      "fwr1_nt", "fwr1", "fwr2_nt", "fwr2", "fwr3_nt", "fwr3", "fwr4_nt", "fwr4"
     )
   )
 
+  setnames(df, "cdr1_nt", IMMCOL_EXT$cdr1nt)
+  setnames(df, "cdr2_nt", IMMCOL_EXT$cdr2nt)
+  setnames(df, "cdr1", IMMCOL_EXT$cdr1aa)
+  setnames(df, "cdr2", IMMCOL_EXT$cdr2aa)
+  setnames(df, "fwr1_nt", IMMCOL_EXT$fr1nt)
+  setnames(df, "fwr2_nt", IMMCOL_EXT$fr2nt)
+  setnames(df, "fwr3_nt", IMMCOL_EXT$fr3nt)
+  setnames(df, "fwr4_nt", IMMCOL_EXT$fr4nt)
+  setnames(df, "fwr1", IMMCOL_EXT$fr1aa)
+  setnames(df, "fwr2", IMMCOL_EXT$fr2aa)
+  setnames(df, "fwr3", IMMCOL_EXT$fr3aa)
+  setnames(df, "fwr4", IMMCOL_EXT$fr4aa)
+
   # Process 10xGenomics filtered contigs files - count barcodes, merge consensues ids, clonotype ids and contig ids
   df <- df[order(df$chain), ]
   setDT(df)
@@ -1061,8 +1074,20 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
       lazy_dt() %>%
       group_by(barcode, raw_clonotype_id) %>%
       summarise(
+        CDR1.nt = paste0(get("CDR1.nt"), collapse = IMMCOL_ADD$scsep),
+        CDR1.aa = paste0(get("CDR1.aa"), collapse = IMMCOL_ADD$scsep),
+        CDR2.nt = paste0(get("CDR2.nt"), collapse = IMMCOL_ADD$scsep),
+        CDR2.aa = paste0(get("CDR2.aa"), collapse = IMMCOL_ADD$scsep),
         CDR3.nt = paste0(get("CDR3.nt"), collapse = IMMCOL_ADD$scsep),
         CDR3.aa = paste0(get("CDR3.aa"), collapse = IMMCOL_ADD$scsep),
+        FR1.nt = paste0(get("FR1.nt"), collapse = IMMCOL_ADD$scsep),
+        FR1.aa = paste0(get("FR1.aa"), collapse = IMMCOL_ADD$scsep),
+        FR2.nt = paste0(get("FR2.nt"), collapse = IMMCOL_ADD$scsep),
+        FR2.aa = paste0(get("FR2.aa"), collapse = IMMCOL_ADD$scsep),
+        FR3.nt = paste0(get("FR3.nt"), collapse = IMMCOL_ADD$scsep),
+        FR3.aa = paste0(get("FR3.aa"), collapse = IMMCOL_ADD$scsep),
+        FR4.nt = paste0(get("FR4.nt"), collapse = IMMCOL_ADD$scsep),
+        FR4.aa = paste0(get("FR4.aa"), collapse = IMMCOL_ADD$scsep),
         V.name = paste0(get("V.name"), collapse = IMMCOL_ADD$scsep),
         J.name = paste0(get("J.name"), collapse = IMMCOL_ADD$scsep),
         D.name = paste0(get("D.name"), collapse = IMMCOL_ADD$scsep),
@@ -1098,7 +1123,19 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
         paste0(unique(get("raw_clonotype_id")), collapse = IMMCOL_ADD$scsep)
       ),
       contig_id = paste0(get("contig_id"), collapse = IMMCOL_ADD$scsep),
-      c_gene = first(get("c_gene"))
+      c_gene = first(get("c_gene")),
+      CDR1.nt = first(get(IMMCOL_EXT$cdr1nt)),
+      CDR2.nt = first(get(IMMCOL_EXT$cdr2nt)),
+      CDR1.aa = first(get(IMMCOL_EXT$cdr1aa)),
+      CDR2.aa = first(get(IMMCOL_EXT$cdr2aa)),
+      FR1.nt = first(get(IMMCOL_EXT$fr1nt)),
+      FR2.nt = first(get(IMMCOL_EXT$fr2nt)),
+      FR3.nt = first(get(IMMCOL_EXT$fr3nt)),
+      FR4.nt = first(get(IMMCOL_EXT$fr4nt)),
+      FR1.aa = first(get(IMMCOL_EXT$fr1aa)),
+      FR2.aa = first(get(IMMCOL_EXT$fr2aa)),
+      FR3.aa = first(get(IMMCOL_EXT$fr3aa)),
+      FR4.aa = first(get(IMMCOL_EXT$fr4aa))
     ) %>%
     as.data.table() %>%
     subset(

From 8a3e737c9c097007b1c2a85938cae15b83086481 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Wed, 7 Dec 2022 14:13:28 +0100
Subject: [PATCH 15/18] Added FR and CDR columns to AIRR format parser

---
 R/io-parsers.R | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/R/io-parsers.R b/R/io-parsers.R
index 2dbcc167..27679ee8 100644
--- a/R/io-parsers.R
+++ b/R/io-parsers.R
@@ -964,19 +964,26 @@ parse_airr <- function(.filename, .mode) {
     .as_tsv() %>%
     airr::read_rearrangement()
 
-  df <- df %>%
+  df %<>%
     select(
       sequence, v_call, d_call, j_call, junction, junction_aa,
       contains("v_germline_end"), contains("d_germline_start"), contains("d_germline_end"),
       contains("j_germline_start"), contains("np1_length"), contains("np2_length"),
-      contains("duplicate_count")
+      contains("duplicate_count"),
+      cdr1, cdr2, cdr1_aa, cdr2_aa, fwr1, fwr2, fwr3, fwr4, fwr1_aa, fwr2_aa, fwr3_aa, fwr4_aa
     )
 
   namekey <- c(
     duplicate_count = IMMCOL$count, junction = IMMCOL$cdr3nt, junction_aa = IMMCOL$cdr3aa,
     v_call = IMMCOL$v, d_call = IMMCOL$d, j_call = IMMCOL$j, v_germline_end = IMMCOL$ve,
     d_germline_start = IMMCOL$ds, d_germline_end = IMMCOL$de, j_germline_start = IMMCOL$js,
-    np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq
+    np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq,
+    cdr1 = IMMCOL_EXT$cdr1nt, cdr2 = IMMCOL_EXT$cdr2nt,
+    cdr1_aa = IMMCOL_EXT$cdr1aa, cdr2_aa = IMMCOL_EXT$cdr2aa,
+    fwr1 = IMMCOL_EXT$fr1nt, fwr2 = IMMCOL_EXT$fr2nt,
+    fwr3 = IMMCOL_EXT$fr3nt, fwr4 = IMMCOL_EXT$fr4nt,
+    fwr1_aa = IMMCOL_EXT$fr1aa, fwr2_aa = IMMCOL_EXT$fr2aa,
+    fwr3_aa = IMMCOL_EXT$fr3aa, fwr4_aa = IMMCOL_EXT$fr4aa
   )
 
   names(df) <- namekey[names(df)]
@@ -998,13 +1005,15 @@ parse_airr <- function(.filename, .mode) {
     }
   }
 
-  for (column in IMMCOL$order) {
+  order <- c(IMMCOL$order, IMMCOL_EXT$order[IMMCOL_EXT$order %in% namekey])
+
+  for (column in order) {
     if (!(column %in% colnames(df))) {
       df[column] <- NA
     }
   }
 
-  df <- df[IMMCOL$order]
+  df <- df[order]
   total <- sum(df$Clones)
   df[IMMCOL$prop] <- df[IMMCOL$count] / total
   df[IMMCOL$seq] <- stringr::str_remove_all(df[[IMMCOL$seq]], "N")

From ff59a12a25479d7351a198c8e601e67d47d33878 Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Thu, 8 Dec 2022 17:00:19 +0100
Subject: [PATCH 16/18] io_utility bugfixes

---
 R/io-utility.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/io-utility.R b/R/io-utility.R
index a43d4870..e251960e 100644
--- a/R/io-utility.R
+++ b/R/io-utility.R
@@ -76,7 +76,7 @@
 
 
 .make_names <- function(.char) {
-  if (is.na(.char[1])) {
+  if (has_no_data(.char)) {
     NA
   } else {
     tolower(.char)
@@ -136,8 +136,8 @@
     .vend, .jstart, .dstart, .dend,
     .vd.insertions, .dj.insertions, .total.insertions
   ))
-  if (!is.na(.add[1])) {
-    swlist <- c(swlist, rep(col_guess(), length(.add)))
+  if (!has_no_data(.add)) {
+    swlist <- c(swlist, rep(list(col_guess()), length(.add)))
     names(swlist)[tail(seq_along(swlist), length(.add))] <- .add
   }
 

From b44c1a9978ada196ae0831b5bf95b82ac57818ca Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Fri, 9 Dec 2022 12:45:10 +0100
Subject: [PATCH 17/18] Update for BCR pipeline tutorial

---
 vignettes/web_only/BCRpipeline.Rmd | 34 ++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/vignettes/web_only/BCRpipeline.Rmd b/vignettes/web_only/BCRpipeline.Rmd
index 5db7e207..d3e1e404 100644
--- a/vignettes/web_only/BCRpipeline.Rmd
+++ b/vignettes/web_only/BCRpipeline.Rmd
@@ -44,11 +44,9 @@ The pipeline involves five steps:
 
     This step involves preparation for phylogenetic and somatic hypermutation analysis.
 
-
 5. **Phylogenetic analysis.**
 
     This step provides phylogeny reconstruction and trunk length calculation (by running the PHYLIP package).
-    
 
 6. **Somatic hypermutation analysis.**
 
@@ -131,9 +129,9 @@ bcrdata$data %>%
 `.species` - Specifies species from which reference V and J are taken.
 Available species: "HomoSapiens" (default), "MusMusculus", "BosTaurus", "CamelusDromedarius", "CanisLupusFamiliaris", "DanioRerio", "MacacaMulatta", "MusMusculusDomesticus", "MusMusculusCastaneus", "MusMusculusMolossinus", "MusMusculusMusculus", "MusSpretus", "OncorhynchusMykiss", "OrnithorhynchusAnatinus", "OryctolagusCuniculus", "RattusNorvegicus", "SusScrofa".
 
-`.min_nuc_outside_cdr3` — this parameter sets how many nucleotides should have V or J chain outside of CDR3 to be considered good for further alignment. Reads with too short chains are filtered out
+`.min_nuc_outside_cdr3` - This parameter sets how many nucleotides should have V or J chain outside of CDR3 to be considered good for further alignment. Reads with too short chains are filtered out.
 
-`.align_j_gene` - if the germline sequence does not assemble correctly in the region of the J gene, then set this parameter to True. This will slow down the algorithm, but the assembly of the germline sequence will be more accurate.
+`.threads` - The number of threads to use.
 
 # Aligning sequences within a clonal lineage
 
@@ -169,7 +167,7 @@ The function has several parameters:
 # take clusters that contain at least 1 sequence
 bcr_data <- bcrdata$data
 align_dt <- bcr_data %>%
-  seqCluster(seqDist(bcr_data, .col = 'CDR3.nt', .group_by_seqLength = TRUE), 
+  seqCluster(seqDist(bcr_data, .col = 'CDR3.nt', .group_by_seqLength = TRUE),
              .perc_similarity = 0.6) %>%
   repGermline(.threads = 1) %>%
   repAlignLineage(.min_lineage_sequences = 6, .align_threads = 2, .nofail = TRUE)
@@ -219,7 +217,7 @@ sudo apt-get install -y phylip
 repClonalFamily usage example:
 
 ```{r example 10, results = 'hide'}
-bcr <-  align_dt %>%
+bcr <- align_dt %>%
   repClonalFamily(.threads = 2, .nofail = TRUE)
 #plot visualization of the first tree
 vis(bcr[["full_clones"]][["TreeStats"]][[1]])
@@ -242,6 +240,24 @@ f[f$DistanceAA != 0, ]['Type'] = 'mutationAA'
 vis(f)
 ```
 
+Another way to recolor leaves is to use `.vis_groups` parameter for repClonalFamily. It allows to assign group names for specific clone IDs, or lists of clone IDs:
+
+```{r example 10.4, results = 'hide'}
+#get all clone IDs from align_dt
+clone_ids <- unnest(align_dt[["full_clones"]], "Sequences")[["Clone.ID"]]
+#run repClonalFamily with assigning some of these clones to differently named and colored groups
+bcr_with_groups <- align_dt %>%
+  repClonalFamily(.vis_groups = list(
+                    Group1 = clone_ids[1],
+                    Group2 = clone_ids[3],
+                    Group3 = list(clone_ids[5], clone_ids[2]),
+                    Group4 = c(clone_ids[7], clone_ids[4])
+                   ), .threads = 2, .nofail = TRUE
+                 )
+#display the first tree from repClonalFamily results
+vis(bcr_with_groups[["full_clones"]][["TreeStats"]][[1]])
+```
+
 We have found 4 clusters:
 
 ```{r example 11, warning = FALSE}
@@ -324,9 +340,9 @@ shm_data$full_clones[ , cols ]
 Then you could easily estimate the mutation rate:
 
 ```{r example 19}
-# estimate mutation rate 
-shm_data$full_clones %>% 
-  mutate(Mutation.Rate = Mutations / (nchar(Common.Ancestor) - CDR3.germline.length)) %>% 
+# estimate mutation rate
+shm_data$full_clones %>%
+  mutate(Mutation.Rate = Mutations / (nchar(Common.Ancestor) - CDR3.germline.length)) %>%
   select(Clone.ID, Mutation.Rate)
 ```
 

From 5a6adc22adae003d63c3ca5157562220ddbc014f Mon Sep 17 00:00:00 2001
From: Aleksandr Popov <alexander230r@gmail.com>
Date: Tue, 13 Dec 2022 17:32:16 +0100
Subject: [PATCH 18/18] CRAN fixes

---
 DESCRIPTION    |  2 +-
 NAMESPACE      |  3 +++
 R/diversity.R  |  1 +
 R/io-parsers.R | 20 ++++++++++----------
 R/io.R         |  2 +-
 R/tools.R      |  5 +++++
 6 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1b264a25..bcafa6da 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -84,6 +84,6 @@ Suggests:
     rmarkdown
 VignetteBuilder: knitr
 Encoding: UTF-8
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.2
 LazyData: true
 LazyDataCompression: xz
diff --git a/NAMESPACE b/NAMESPACE
index d6c0a501..6dba8022 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -147,10 +147,12 @@ importFrom(dplyr,group_map)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,n)
+importFrom(dplyr,one_of)
 importFrom(dplyr,pull)
 importFrom(dplyr,rename)
 importFrom(dplyr,rowwise)
 importFrom(dplyr,select)
+importFrom(dplyr,select_)
 importFrom(dplyr,select_if)
 importFrom(dplyr,summarise)
 importFrom(dplyr,tally)
@@ -290,6 +292,7 @@ importFrom(tibble,tibble)
 importFrom(tidyr,drop_na)
 importFrom(tidyr,unite)
 importFrom(tidyr,unnest)
+importFrom(tidyselect,all_of)
 importFrom(tidyselect,any_of)
 importFrom(tidyselect,starts_with)
 importFrom(utils,capture.output)
diff --git a/R/diversity.R b/R/diversity.R
index 8c965cee..0fd027b1 100644
--- a/R/diversity.R
+++ b/R/diversity.R
@@ -14,6 +14,7 @@ if (getRversion() >= "2.15.1") {
 #' @importFrom dplyr mutate group_by_at pull
 #' @importFrom stats qnorm
 #' @importFrom rlang sym
+#' @importFrom tidyselect all_of
 #'
 #' @description
 #' This is a utility function to estimate the diversity of species or objects in the given distribution.
diff --git a/R/io-parsers.R b/R/io-parsers.R
index 27679ee8..d41c53bb 100644
--- a/R/io-parsers.R
+++ b/R/io-parsers.R
@@ -834,8 +834,6 @@ parse_tcr <- function(.filename, .mode) {
 }
 
 parse_vdjtools <- function(.filename, .mode) {
-  skip <- 0
-
   # Check for different VDJtools outputs
   f <- file(.filename, "r")
   l <- readLines(f, 1)
@@ -965,12 +963,14 @@ parse_airr <- function(.filename, .mode) {
     airr::read_rearrangement()
 
   df %<>%
-    select(
-      sequence, v_call, d_call, j_call, junction, junction_aa,
-      contains("v_germline_end"), contains("d_germline_start"), contains("d_germline_end"),
-      contains("j_germline_start"), contains("np1_length"), contains("np2_length"),
-      contains("duplicate_count"),
-      cdr1, cdr2, cdr1_aa, cdr2_aa, fwr1, fwr2, fwr3, fwr4, fwr1_aa, fwr2_aa, fwr3_aa, fwr4_aa
+    select_(
+      "sequence", "v_call", "d_call", "j_call", "junction", "junction_aa",
+      ~contains("v_germline_end"), ~contains("d_germline_start"),
+      ~contains("d_germline_end"), ~contains("j_germline_start"),
+      ~contains("np1_length"), ~contains("np2_length"),
+      ~contains("duplicate_count"),
+      "cdr1", "cdr2", "cdr1_aa", "cdr2_aa", "fwr1", "fwr2", "fwr3", "fwr4",
+      "fwr1_aa", "fwr2_aa", "fwr3_aa", "fwr4_aa"
     )
 
   namekey <- c(
@@ -1081,7 +1081,7 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
   if (.mode == "paired") {
     df %<>%
       lazy_dt() %>%
-      group_by(barcode, raw_clonotype_id) %>%
+      group_by_colnames("barcode", "raw_clonotype_id") %>%
       summarise(
         CDR1.nt = paste0(get("CDR1.nt"), collapse = IMMCOL_ADD$scsep),
         CDR1.aa = paste0(get("CDR1.aa"), collapse = IMMCOL_ADD$scsep),
@@ -1117,7 +1117,7 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
       V.name.sorted = sort_string(get("V.name"), IMMCOL_ADD$scsep),
       J.name.sorted = sort_string(get("J.name"), IMMCOL_ADD$scsep)
     ) %>%
-    group_by(CDR3.nt.sorted, V.name.sorted, J.name.sorted) %>%
+    group_by_colnames("CDR3.nt.sorted", "V.name.sorted", "J.name.sorted") %>%
     summarise(
       Clones = length(unique(get("barcode"))),
       CDR3.nt = first(get("CDR3.nt")),
diff --git a/R/io.R b/R/io.R
index 0b607537..aebedab0 100644
--- a/R/io.R
+++ b/R/io.R
@@ -20,7 +20,7 @@ if (getRversion() >= "2.15.1") {
 #' @importFrom jsonlite read_json
 #' @importFrom stringr str_split str_detect str_replace_all str_trim
 #' @importFrom methods as
-#' @importFrom dplyr contains first
+#' @importFrom dplyr contains first select_ group_by_at one_of
 #' @importFrom utils read.table
 #' @importFrom data.table setDF
 #'
diff --git a/R/tools.R b/R/tools.R
index b7fc8976..382143c6 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -494,6 +494,11 @@ has_no_data <- function(.data) {
   any(sapply(list(NA, NULL, NaN), identical, .data)) | all(is.na(.data))
 }
 
+# variant of group_by that takes column names as strings
+group_by_colnames <- function(.data, ...) {
+  group_by_at(.data, vars(one_of(...)))
+}
+
 # apply function to .data if it's a single sample or to each sample if .data is a list of samples
 apply_to_sample_or_list <- function(.data, .function, .with_names = FALSE, .validate = TRUE, ...) {
   if (has_no_data(.data)) {