Skip to content

Commit

Permalink
Merge pull request #270 from immunomind/feature/tree-visualization
Browse files Browse the repository at this point in the history
Clonal tree visualization
  • Loading branch information
Aleksandr Popov authored Aug 11, 2022
2 parents 02dd729 + 3adbd76 commit afbd98d
Show file tree
Hide file tree
Showing 18 changed files with 505 additions and 153 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ Imports:
glue,
phangorn,
uuid,
stringi
stringi,
ggraph
Depends:
R (>= 4.0.0),
ggplot2 (>= 3.1.0),
Expand Down
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Generated by roxygen2: do not edit by hand

S3method(vis,clonal_family)
S3method(vis,clonal_family_tree)
S3method(vis,immunr_chao1)
S3method(vis,immunr_clonal_prop)
S3method(vis,immunr_dbscan)
Expand Down Expand Up @@ -35,6 +37,7 @@ S3method(vis,immunr_spectr)
S3method(vis,immunr_spectr_nogene)
S3method(vis,immunr_top_prop)
S3method(vis,immunr_tsne)
S3method(vis,step_failure_ignored)
export(apply_asymm)
export(apply_symm)
export(bunch_translate)
Expand Down Expand Up @@ -168,6 +171,10 @@ importFrom(ggpubr,ggscatter)
importFrom(ggpubr,rotate_x_text)
importFrom(ggpubr,stat_compare_means)
importFrom(ggpubr,theme_pubr)
importFrom(ggraph,geom_edge_diagonal)
importFrom(ggraph,geom_node_point)
importFrom(ggraph,ggraph)
importFrom(ggraph,theme_graph)
importFrom(ggseqlogo,geom_logo)
importFrom(ggseqlogo,theme_logo)
importFrom(glue,glue)
Expand Down Expand Up @@ -279,6 +286,7 @@ importFrom(stringr,str_sub)
importFrom(stringr,str_trim)
importFrom(tibble,rownames_to_column)
importFrom(tibble,tibble)
importFrom(tidyr,drop_na)
importFrom(tidyr,unite)
importFrom(tidyr,unnest)
importFrom(tidyselect,starts_with)
Expand Down
35 changes: 20 additions & 15 deletions R/align_lineage.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
#' This function aligns all sequences (incliding germline) that belong to one clonal lineage and one cluster.
#' After clustering and building the clonal lineage and germline, the next step is to analyze the degree of mutation
#' and maturity of each clonal lineage. This allows for finding high mature cells and cells with a large
#' number of offspring. The phylogenetic analysis will find mutations that increase the affinity of BCR.
#' Making alignment of the sequence is the first step towards sequence analysis including BCR.
#' Aligns all sequences incliding germline within each clonal lineage within each cluster
#'
#' @concept align_lineage
#'
Expand All @@ -18,7 +14,12 @@
#' @importFrom doParallel registerDoParallel stopImplicitCluster
#' @importFrom parallel mclapply

#' @description Aligns all sequences incliding germline within each clonal lineage within each cluster
#' @description This function aligns all sequences (incliding germline) that belong to one clonal
#' lineage and one cluster. After clustering and building the clonal lineage and germline, the next
#' step is to analyze the degree of mutation and maturity of each clonal lineage. This allows for
#' finding high mature cells and cells with a large number of offspring. The phylogenetic analysis
#' will find mutations that increase the affinity of BCR. Making alignment of the sequence
#' is the first step towards sequence analysis including BCR.
#'
#' @usage
#'
Expand Down Expand Up @@ -62,9 +63,9 @@
#' * Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage
#' (.min_lineage_sequences is below the threshold); TRUE if it was aligned
#' * Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE)
#' * V.length (included if .verbose_output=TRUE): shortest length of V gene part outside of CDR3 region in this
#' * V.length: shortest length of V gene part outside of CDR3 region in this
#' group of sequences; longer V genes (including germline) are trimmed to this length before alignment
#' * J.length (included if .verbose_output=TRUE): shortest length of J gene part outside of CDR3 region in this
#' * J.length: shortest length of J gene part outside of CDR3 region in this
#' group of sequences; longer J genes (including germline) are trimmed to this length before alignment
#' * Sequences: nested dataframe containing all sequences for this combination
#' of cluster and germline; it has columns
Expand All @@ -79,7 +80,7 @@
#'
#' bcr_data %>%
#' seqCluster(seqDist(bcr_data), .fixed_threshold = 3) %>%
#' repGermline(.threads = 2) %>%
#' repGermline(.threads = 1) %>%
#' repAlignLineage(.min_lineage_sequences = 2, .align_threads = 2, .nofail = TRUE)
#' @export repAlignLineage
repAlignLineage <- function(.data,
Expand All @@ -93,7 +94,7 @@ repAlignLineage <- function(.data,
"Please download it from here: http://www.clustal.org/download/current/\n",
"or install it with your system package manager (such as apt or dnf)."
), .nofail)) {
return(NA)
return(get_empty_object_with_class("step_failure_ignored"))
}

doParallel::registerDoParallel(cores = .prepare_threads)
Expand Down Expand Up @@ -190,9 +191,8 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
sequences[["Clone.ID"]] %<>% as.integer()
sequences[["Clones"]] %<>% as.integer()

germline_parts <- strsplit(germline_seq, "N")[[1]]
germline_v_len <- stringr::str_length(germline_parts[1])
germline_j_len <- stringr::str_length(tail(germline_parts, 1))
germline_v_len <- str_length(germline_v)
germline_j_len <- str_length(germline_j)
v_min_len <- min(lineage_subset[["V.lengths"]], germline_v_len)
j_min_len <- min(lineage_subset[["J.lengths"]], germline_j_len)

Expand Down Expand Up @@ -233,14 +233,16 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose
J.germline.nt = germline_j,
CDR3.germline.length = germline_cdr3_len,
Alignment = alignment,
V.length = v_min_len,
J.length = j_min_len,
Sequences = sequences
))
}
}

# trim V/J tails in sequence to the specified lenghts v_min, j_min
trim_seq <- function(seq, v_len, v_min, j_len, j_min) {
stringr::str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
}

convert_results_to_df <- function(nested_results_list, nested_alignments_list) {
Expand All @@ -254,7 +256,10 @@ convert_results_to_df <- function(nested_results_list, nested_alignments_list) {
lapply(rlist::list.remove, c("Alignment", "Sequences")) %>%
purrr::map_dfr(~.) %>%
cbind(alignments, sequences)
df[["CDR3.germline.length"]] %<>% as.integer()
# fix column types after dataframe rebuilding
for (column in c("CDR3.germline.length", "V.length", "J.length")) {
df[[column]] %<>% as.integer()
}
return(df)
}

Expand Down
76 changes: 43 additions & 33 deletions R/germline.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
#' This function creates germlines for clonal lineages. B cell clonal lineage represents a set of B cells
#' that presumably have a common origin (arising from the same VDJ rearrangement event) and a common ancestor.
#' Each clonal lineage has its own germline sequence that represents the ancestral sequence
#' for each BCR in clonal lineage. In other words, germline sequence is a sequence of B-cells immediately
#' after VDJ recombination, before B-cell maturation and hypermutation process. Germline sequence is useful
#' for assessing the degree of mutation and maturity of the repertoire.
#' Creates germlines for clonal lineages
#'
#' @concept germline
#'
Expand All @@ -16,7 +11,13 @@
#' @importFrom parallel parApply detectCores makeCluster clusterExport stopCluster
#' @importFrom ape as.DNAbin clustal
#'
#' @description Creates germlines for clonal lineages
#' @description This function creates germlines for clonal lineages. B cell clonal lineage
#' represents a set of B cells that presumably have a common origin (arising from the same VDJ
#' rearrangement event) and a common ancestor. Each clonal lineage has its own germline sequence
#' that represents the ancestral sequence for each BCR in clonal lineage. In other words,
#' germline sequence is a sequence of B-cells immediately after VDJ recombination, before
#' B-cell maturation and hypermutation process. Germline sequence is useful for assessing
#' the degree of mutation and maturity of the repertoire.
#'
#' @usage
#'
Expand Down Expand Up @@ -137,14 +138,16 @@ calculate_germlines_parallel <- function(data, align_j_gene, threads, sample_nam
seq = row[["Sequence"]],
v_ref = row[["V.ref.nt"]],
j_ref = row[["J.ref.nt"]],
v_end = str_length(row[["CDR1.nt"]]) + str_length(row[["CDR2.nt"]])
+ str_length(row[["FR1.nt"]]) + str_length(row[["FR2.nt"]])
+ str_length(row[["FR3.nt"]]),
cdr1_nt = row[["CDR1.nt"]],
cdr2_nt = row[["CDR2.nt"]],
fr1_nt = row[["FR1.nt"]],
fr2_nt = row[["FR2.nt"]],
fr3_nt = row[["FR3.nt"]],
fr4_nt = row[["FR4.nt"]],
cdr3_start = as.integer(row[["CDR3.start"]]),
cdr3_end = as.integer(row[["CDR3.end"]]),
j_start = as.integer(row[["J.start"]]),
j3_del = as.integer(row[["J3.Deletions"]]),
fr4_seq = row[["FR4.nt"]],
align_j_gene = align_j_gene,
sample_name = sample_name
)
Expand All @@ -157,43 +160,47 @@ calculate_germlines_parallel <- function(data, align_j_gene, threads, sample_nam
return(data)
}

generate_germline_sequence <- function(seq,
v_ref,
j_ref,
v_end,
cdr3_start,
cdr3_end,
j_start,
j3_del,
fr4_seq,
align_j_gene,
sample_name) {
if (any(is.na(c(seq, v_ref, j_ref, v_end, cdr3_start, cdr3_end, j_start, j3_del, fr4_seq))) ||
generate_germline_sequence <- function(seq, v_ref, j_ref, cdr1_nt, cdr2_nt,
fr1_nt, fr2_nt, fr3_nt, fr4_nt,
cdr3_start, cdr3_end, j_start, j3_del,
align_j_gene, sample_name) {
if (any(is.na(c(
seq, v_ref, j_ref, cdr1_nt, cdr2_nt, fr1_nt, fr2_nt, fr3_nt, fr4_nt,
cdr3_start, cdr3_end, j_start, j3_del
))) ||
(seq == "")) {
# warnings cannot be displayed from parApply; save them and display after finish
warn <- paste0(
"Some of mandatory fields in a row ",
optional_sample("from sample ", sample_name, " "),
"contain unexpected NA or empty strings! Found values:\n",
"Sequence = \"",
"Sequence = ",
seq,
"\",\nV.ref.nt = \"",
",\nV.ref.nt = ",
v_ref,
"\",\nJ.ref.nt = \"",
",\nJ.ref.nt = ",
j_ref,
"\",\nCalculated_V_end = ",
v_end,
", CDR3.start = ",
",\nCDR1.nt = ",
cdr1_nt,
",\nCDR2.nt = ",
cdr2_nt,
",\nFR1.nt = ",
fr1_nt,
",\nFR2.nt = ",
fr2_nt,
",\nFR3.nt = ",
fr3_nt,
",\nFR4.nt = ",
fr4_nt,
",\nCDR3.start = ",
cdr3_start,
", CDR3.end = ",
cdr3_end,
", J.start = ",
j_start,
", J3.Deletions = ",
j3_del,
",\nFR4.nt = \"",
fr4_seq,
"\".\nThe row will be dropped!"
".\nThe row will be dropped!"
)
return(list(
V.germline.nt = NA,
Expand All @@ -203,6 +210,8 @@ generate_germline_sequence <- function(seq,
Warning = warn
))
} else {
v_end <- str_length(cdr1_nt) + str_length(cdr2_nt) +
str_length(fr1_nt) + str_length(fr2_nt) + str_length(fr3_nt)
cdr3_length <- cdr3_end - cdr3_start

# trim intersection of V and CDR3 from reference V gene
Expand All @@ -212,7 +221,7 @@ generate_germline_sequence <- function(seq,

# trim intersection of J and CDR3 from reference J gene
if (align_j_gene) {
calculated_j_start <- align_and_find_j_start(j_ref, fr4_seq)
calculated_j_start <- align_and_find_j_start(j_ref, fr4_nt)
} else {
calculated_j_start <- max(0, cdr3_end - j_start - j3_del + 1)
}
Expand Down Expand Up @@ -402,6 +411,7 @@ align_and_find_j_start <- function(j_ref, fr4_seq, max_len_diff = 10) {
germline_handle_warnings <- function(df) {
warnings <- df$Warning
warnings <- warnings[!is.na(warnings)]
options(warning.length = 5000L)
for (warn in warnings) {
warning(warn)
}
Expand Down
2 changes: 1 addition & 1 deletion R/io-parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ parse_repertoire <- function(.filename, .mode, .nuc.seq, .aa.seq, .count,
)

# IO_REFACTOR
suppressMessages(df <- readr::read_delim(.filename,
df <- quiet(readr::read_delim(.filename,
col_names = TRUE,
col_types = col.classes, delim = .sep,
quote = "", escape_double = FALSE,
Expand Down
Loading

0 comments on commit afbd98d

Please sign in to comment.