From 128660014444d74eb5b4c4e0d522b8eff4021b6f Mon Sep 17 00:00:00 2001
From: mikessh <mikhail.shugay@gmail.com>
Date: Thu, 13 Jun 2024 21:32:13 +0300
Subject: [PATCH] WIP & version upd

---
 LICENSE.txt               |  2 +-
 latest-version.txt        |  1 +
 summary/vdjdb_summary.Rmd | 52 ++++++++++++++++++++++-----------------
 3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/LICENSE.txt b/LICENSE.txt
index 0dd2c77..906c2b5 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,6 +1,6 @@
 VDJDB: A curated database of T-cell receptor sequences of known antigen specificity
 
-Copyright 2015-2022 VDJdb Developers
+Copyright 2015-2024 VDJdb Developers and Maintainers
 
 Attribution-NoDerivatives 4.0 International
 
diff --git a/latest-version.txt b/latest-version.txt
index d0d5413..921a78d 100644
--- a/latest-version.txt
+++ b/latest-version.txt
@@ -1,3 +1,4 @@
+https://github.com/antigenomics/vdjdb-db/releases/download/2024-06-13/vdjdb-2024-06-13.zip
 https://github.com/antigenomics/vdjdb-db/releases/download/2024-05-23/vdjdb-2024-05-23.zip
 https://github.com/antigenomics/vdjdb-db/releases/download/2023-06-01/vdjdb-2023-06-01.zip
 https://github.com/antigenomics/vdjdb-db/releases/download/2022-03-30/vdjdb-2022-03-30.zip
diff --git a/summary/vdjdb_summary.Rmd b/summary/vdjdb_summary.Rmd
index 2769c79..1a9a3d0 100644
--- a/summary/vdjdb_summary.Rmd
+++ b/summary/vdjdb_summary.Rmd
@@ -176,10 +176,10 @@ dt.vdjdb.s2 = dt.vdjdb.s %>%
             mhc_count = length(unique(mhc_key[which(pub_date <= pub_date2)])))
 
 p1=ggplot(dt.vdjdb.s2, aes(x = as.integer(pub_date2), y = tcr_count, color = chains)) +
-  annotate("segment", x = 2017, xend = 2017, y = 0, yend = 19000, linetype="dotted", size = 0.3) +
-  annotate("text", x = 2017, y = 21000, label = "AIRR-seq for tet+", hjust = 1, vjust = 1) +
-  annotate("segment", x = 2019, xend = 2019, y = 0, yend = 27000, linetype="dotted", size = 0.3) +
-  annotate("text", x = 2019, y = 30000, label = "10X & dCODE", hjust = 1, vjust = 1) +
+  annotate("segment", x = 2017, xend = 2017, y = 0, yend = 19000, linetype="solid", color = "grey25", size = 0.3) +
+  annotate("text", x = 2017, y = 21500, label = "AIRR-seq for tet+", hjust = 1, vjust = 1) +
+  annotate("segment", x = 2019, xend = 2019, y = 0, yend = 27000, linetype="solid", color = "grey25", size = 0.3) +
+  annotate("text", x = 2019, y = 31000, label = "10X & dCODE", hjust = 1, vjust = 1) +
   geom_line() +
   geom_point() +
   ylab("") +
@@ -190,7 +190,7 @@ p1=ggplot(dt.vdjdb.s2, aes(x = as.integer(pub_date2), y = tcr_count, color = cha
         axis.line = element_line(size = 0.3))
 
 p2=ggplot(dt.vdjdb.s2, aes(x = as.integer(pub_date2), y = epi_count, color = chains)) +
-  annotate("segment", x = 2021, xend = 2021, y = 0, yend = 1000, linetype="dotted", size = 0.3) +
+  annotate("segment", x = 2021, xend = 2021, y = 0, yend = 1000, linetype="solid", color = "grey25", size = 0.3) +
   annotate("text", x = 2021, y = 1150, label = "COVID-19 studies", hjust = 1, vjust = 1) +
   geom_line() +
   geom_point() +
@@ -274,10 +274,12 @@ Summary of antigens and T-cell receptors related to COVID-19 pandemic. Number of
 df %>% 
   filter(species == "HomoSapiens", 
          startsWith(as.character(antigen.species), "SARS-CoV")) %>%
-  mutate(mhc.a = str_split_fixed(mhc.a, "[,:]", 2)[,1]) %>%
-  group_by(antigen.gene, mhc.a, antigen.epitope) %>%
+  mutate(mhc.a = str_split_fixed(mhc.a, "[,:]", 2)[,1],
+         mhc.b = str_split_fixed(mhc.b, "[,:]", 2)[,1],
+         mhc = ifelse(mhc.class == "MHCI", mhc.a, paste0(mhc.a, '/', substr(mhc.b, 7, 15)))) %>%
+  group_by(antigen.gene, mhc, antigen.epitope) %>%
   mutate(publications = length(unique(str_split_fixed(reference.id, ",", n = Inf)[,1]))) %>%
-  group_by(antigen.gene, mhc.a, antigen.epitope, gene, publications) %>%
+  group_by(antigen.gene, mhc, antigen.epitope, gene, publications) %>%
   summarize(records = n()) -> df.c
 
 colnames(df.c) = c("Gene", "HLA", "Epitope", "TCR chain",
@@ -293,7 +295,7 @@ ggplot(df.c %>%
            y = log2(Records))) +
   geom_alluvium(aes(fill = substr(Epitope,1,3) %>% as.factor %>% as.integer), 
                 color = "white", alpha = 0.8, curve_type = "sigmoid") +
-  geom_stratum(fill = "white", color = "black", size=0.2) +
+  geom_stratum(fill = "grey95", color = "white", size=1.0) +
   geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
   scale_fill_distiller(palette = "Set3", guide=F, "") +
   #scale_fill_hue(guide=F, "") +
@@ -303,35 +305,38 @@ ggplot(df.c %>%
   theme_void() +
   theme(axis.text.y = element_blank(),
         axis.ticks.y = element_blank(),
-        axis.text.x =  element_text(size = 16, color = "black"),
+        axis.text.x =  element_text(size = 16, color = "black", vjust = -5),
         axis.ticks.x = element_blank(),
-        panel.grid.major.y = element_blank(),
-        legend.position = "bottom")
+        panel.grid.major.y = element_blank())
 ```
 
 Summary of SARS-CoV-2 epitopes and corresponding TCR alpha and beta chain specificity records (cases with 10+ records)
 
 ```{r message=FALSE, warning=FALSE}
 kable(format = "html", 
-      df.c %>% reshape2::dcast(Gene + HLA + Epitope + Studies ~ `TCR chain`, fill = 0) %>%
+      df.c %>% 
+        reshape2::dcast(Gene + HLA + Epitope + Studies ~ `TCR chain`, fill = 0) %>%
+        mutate(HLA = gsub("*", ".", HLA, fixed = T)) %>%
         filter(TRA+TRB >= 10) %>%
         arrange(-(TRB+TRA)))
 ```
 
 ---
 
-#### **Neoantigen** data
+#### **Self-antigen** data
 
-Summary of potential neoantigen targets for immunotherapy and T-cell receptors recognizing them. Number of records for neoantigens grouped by mutated gene and HLA plotted using alluvium plot. Neoantigens with less than 10 records in total were not counted.
+Summary of T-cell receptors recognizing self-antigens, including antigens linked to autoimmune diseases and potential neoantigen targets for cancer immunotherapy. The number of records for self-antigens, grouped by (mutated) human gene and corresponding HLA, is plotted using an alluvium plot. Only self-antigens with at least 10 records are shown.
 
 ```{r message=FALSE, warning=FALSE, fig.width=12, fig.height=10}
 df %>% 
   filter(species == "HomoSapiens", 
          startsWith(as.character(antigen.species), "HomoSapiens")) %>%
-  mutate(mhc.a = str_split_fixed(mhc.a, "[,:]", 2)[,1]) %>%
-  group_by(antigen.gene, mhc.a, antigen.epitope) %>%
+  mutate(mhc.a = str_split_fixed(mhc.a, "[,:]", 2)[,1],
+         mhc.b = str_split_fixed(mhc.b, "[,:]", 2)[,1],
+         mhc = ifelse(mhc.class == "MHCI", mhc.a, paste0(mhc.a, '/', substr(mhc.b, 7, 15)))) %>%
+  group_by(antigen.gene, mhc, antigen.epitope) %>%
   mutate(publications = length(unique(str_split_fixed(reference.id, ",", n = Inf)[,1]))) %>%
-  group_by(antigen.gene, mhc.a, antigen.epitope, gene, publications) %>%
+  group_by(antigen.gene, mhc, antigen.epitope, gene, publications) %>%
   summarize(records = n()) -> df.n
 
 colnames(df.n) = c("Gene", "HLA", "Epitope", "TCR chain",
@@ -347,7 +352,7 @@ ggplot(df.n %>%
            y = log2(Records))) +
   geom_alluvium(aes(fill = substr(Epitope,1,3) %>% as.factor %>% as.integer), 
                 color = "white", alpha = 0.8, curve_type = "sigmoid") +
-  geom_stratum(fill = "white", color = "black", size=0.2) +
+  geom_stratum(fill = "grey95", color = "white", size=1.0) +
   geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
   scale_fill_distiller(palette = "Accent", guide=F, "") +
   #scale_fill_hue(guide=F, "") +
@@ -357,17 +362,18 @@ ggplot(df.n %>%
   theme_void() +
   theme(axis.text.y = element_blank(),
         axis.ticks.y = element_blank(),
-        axis.text.x =  element_text(size = 16, color = "black"),
+        axis.text.x =  element_text(size = 16, color = "black", vjust = -5),
         axis.ticks.x = element_blank(),
-        panel.grid.major.y = element_blank(),
-        legend.position = "bottom")
+        panel.grid.major.y = element_blank())
 ```
 
 Summary of neoantigens and corresponding TCR alpha and beta chain specificity records (cases with 5+ records)
 
 ```{r message=FALSE, warning=FALSE}
 kable(format = "html", 
-      df.n %>% reshape2::dcast(Gene + HLA + Epitope + Studies ~ `TCR chain`, fill = 0) %>%
+      df.n %>% 
+        reshape2::dcast(Gene + HLA + Epitope + Studies ~ `TCR chain`, fill = 0) %>%
+        mutate(HLA = gsub("*", ".", HLA, fixed = T)) %>%
         filter(TRA+TRB >= 5) %>%
         arrange(-(TRB+TRA)))
 ```