From 128660014444d74eb5b4c4e0d522b8eff4021b6f Mon Sep 17 00:00:00 2001 From: mikessh Date: Thu, 13 Jun 2024 21:32:13 +0300 Subject: [PATCH] WIP & version upd --- LICENSE.txt | 2 +- latest-version.txt | 1 + summary/vdjdb_summary.Rmd | 52 ++++++++++++++++++++++----------------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index 0dd2c77..906c2b5 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ VDJDB: A curated database of T-cell receptor sequences of known antigen specificity -Copyright 2015-2022 VDJdb Developers +Copyright 2015-2024 VDJdb Developers and Maintainers Attribution-NoDerivatives 4.0 International diff --git a/latest-version.txt b/latest-version.txt index d0d5413..921a78d 100644 --- a/latest-version.txt +++ b/latest-version.txt @@ -1,3 +1,4 @@ +https://github.com/antigenomics/vdjdb-db/releases/download/2024-06-13/vdjdb-2024-06-13.zip https://github.com/antigenomics/vdjdb-db/releases/download/2024-05-23/vdjdb-2024-05-23.zip https://github.com/antigenomics/vdjdb-db/releases/download/2023-06-01/vdjdb-2023-06-01.zip https://github.com/antigenomics/vdjdb-db/releases/download/2022-03-30/vdjdb-2022-03-30.zip diff --git a/summary/vdjdb_summary.Rmd b/summary/vdjdb_summary.Rmd index 2769c79..1a9a3d0 100644 --- a/summary/vdjdb_summary.Rmd +++ b/summary/vdjdb_summary.Rmd @@ -176,10 +176,10 @@ dt.vdjdb.s2 = dt.vdjdb.s %>% mhc_count = length(unique(mhc_key[which(pub_date <= pub_date2)]))) p1=ggplot(dt.vdjdb.s2, aes(x = as.integer(pub_date2), y = tcr_count, color = chains)) + - annotate("segment", x = 2017, xend = 2017, y = 0, yend = 19000, linetype="dotted", size = 0.3) + - annotate("text", x = 2017, y = 21000, label = "AIRR-seq for tet+", hjust = 1, vjust = 1) + - annotate("segment", x = 2019, xend = 2019, y = 0, yend = 27000, linetype="dotted", size = 0.3) + - annotate("text", x = 2019, y = 30000, label = "10X & dCODE", hjust = 1, vjust = 1) + + annotate("segment", x = 2017, xend = 2017, y = 0, yend = 19000, 
linetype="solid", color = "grey25", size = 0.3) + + annotate("text", x = 2017, y = 21500, label = "AIRR-seq for tet+", hjust = 1, vjust = 1) + + annotate("segment", x = 2019, xend = 2019, y = 0, yend = 27000, linetype="solid", color = "grey25", size = 0.3) + + annotate("text", x = 2019, y = 31000, label = "10X & dCODE", hjust = 1, vjust = 1) + geom_line() + geom_point() + ylab("") + @@ -190,7 +190,7 @@ p1=ggplot(dt.vdjdb.s2, aes(x = as.integer(pub_date2), y = tcr_count, color = cha axis.line = element_line(size = 0.3)) p2=ggplot(dt.vdjdb.s2, aes(x = as.integer(pub_date2), y = epi_count, color = chains)) + - annotate("segment", x = 2021, xend = 2021, y = 0, yend = 1000, linetype="dotted", size = 0.3) + + annotate("segment", x = 2021, xend = 2021, y = 0, yend = 1000, linetype="solid", color = "grey25", size = 0.3) + annotate("text", x = 2021, y = 1150, label = "COVID-19 studies", hjust = 1, vjust = 1) + geom_line() + geom_point() + @@ -274,10 +274,12 @@ Summary of antigens and T-cell receptors related to COVID-19 pandemic. 
Number of df %>% filter(species == "HomoSapiens", startsWith(as.character(antigen.species), "SARS-CoV")) %>% - mutate(mhc.a = str_split_fixed(mhc.a, "[,:]", 2)[,1]) %>% - group_by(antigen.gene, mhc.a, antigen.epitope) %>% + mutate(mhc.a = str_split_fixed(mhc.a, "[,:]", 2)[,1], + mhc.b = str_split_fixed(mhc.b, "[,:]", 2)[,1], + mhc = ifelse(mhc.class == "MHCI", mhc.a, paste0(mhc.a, '/', substr(mhc.b, 7, 15)))) %>% + group_by(antigen.gene, mhc, antigen.epitope) %>% mutate(publications = length(unique(str_split_fixed(reference.id, ",", n = Inf)[,1]))) %>% - group_by(antigen.gene, mhc.a, antigen.epitope, gene, publications) %>% + group_by(antigen.gene, mhc, antigen.epitope, gene, publications) %>% summarize(records = n()) -> df.c colnames(df.c) = c("Gene", "HLA", "Epitope", "TCR chain", @@ -293,7 +295,7 @@ ggplot(df.c %>% y = log2(Records))) + geom_alluvium(aes(fill = substr(Epitope,1,3) %>% as.factor %>% as.integer), color = "white", alpha = 0.8, curve_type = "sigmoid") + - geom_stratum(fill = "white", color = "black", size=0.2) + + geom_stratum(fill = "grey95", color = "white", size=1.0) + geom_text(stat = "stratum", aes(label = after_stat(stratum))) + scale_fill_distiller(palette = "Set3", guide=F, "") + #scale_fill_hue(guide=F, "") + @@ -303,35 +305,38 @@ ggplot(df.c %>% theme_void() + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), - axis.text.x = element_text(size = 16, color = "black"), + axis.text.x = element_text(size = 16, color = "black", vjust = -5), axis.ticks.x = element_blank(), - panel.grid.major.y = element_blank(), - legend.position = "bottom") + panel.grid.major.y = element_blank()) ``` Summary of SARS-CoV-2 epitopes and corresponding TCR alpha and beta chain specificity records (cases with 10+ records) ```{r message=FALSE, warning=FALSE} kable(format = "html", - df.c %>% reshape2::dcast(Gene + HLA + Epitope + Studies ~ `TCR chain`, fill = 0) %>% + df.c %>% + reshape2::dcast(Gene + HLA + Epitope + Studies ~ `TCR chain`, fill = 0) 
%>% + mutate(HLA = gsub("*", ".", HLA, fixed = T)) %>% filter(TRA+TRB >= 10) %>% arrange(-(TRB+TRA))) ``` --- -#### **Neoantigen** data +#### **Self-antigen** data -Summary of potential neoantigen targets for immunotherapy and T-cell receptors recognizing them. Number of records for neoantigens grouped by mutated gene and HLA plotted using alluvium plot. Neoantigens with less than 10 records in total were not counted. +Summary of T-cell receptors recognizing self-antigens, including antigens linked to autoimmune diseases and potential neoantigen targets for cancer immunotherapy. Number of records for self-antigens grouped by (mutated) human gene and corresponding HLAs is plotted using an alluvium plot. Only self-antigens with at least 10 records are shown. ```{r message=FALSE, warning=FALSE, fig.width=12, fig.height=10} df %>% filter(species == "HomoSapiens", startsWith(as.character(antigen.species), "HomoSapiens")) %>% - mutate(mhc.a = str_split_fixed(mhc.a, "[,:]", 2)[,1]) %>% - group_by(antigen.gene, mhc.a, antigen.epitope) %>% + mutate(mhc.a = str_split_fixed(mhc.a, "[,:]", 2)[,1], + mhc.b = str_split_fixed(mhc.b, "[,:]", 2)[,1], + mhc = ifelse(mhc.class == "MHCI", mhc.a, paste0(mhc.a, '/', substr(mhc.b, 7, 15)))) %>% + group_by(antigen.gene, mhc, antigen.epitope) %>% mutate(publications = length(unique(str_split_fixed(reference.id, ",", n = Inf)[,1]))) %>% - group_by(antigen.gene, mhc.a, antigen.epitope, gene, publications) %>% + group_by(antigen.gene, mhc, antigen.epitope, gene, publications) %>% summarize(records = n()) -> df.n colnames(df.n) = c("Gene", "HLA", "Epitope", "TCR chain", @@ -347,7 +352,7 @@ ggplot(df.n %>% y = log2(Records))) + geom_alluvium(aes(fill = substr(Epitope,1,3) %>% as.factor %>% as.integer), color = "white", alpha = 0.8, curve_type = "sigmoid") + - geom_stratum(fill = "white", color = "black", size=0.2) + + geom_stratum(fill = "grey95", color = "white", size=1.0) + geom_text(stat = "stratum", aes(label = after_stat(stratum))) + 
scale_fill_distiller(palette = "Accent", guide=F, "") + #scale_fill_hue(guide=F, "") + @@ -357,17 +362,18 @@ ggplot(df.n %>% theme_void() + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), - axis.text.x = element_text(size = 16, color = "black"), + axis.text.x = element_text(size = 16, color = "black", vjust = -5), axis.ticks.x = element_blank(), - panel.grid.major.y = element_blank(), - legend.position = "bottom") + panel.grid.major.y = element_blank()) ``` Summary of neoantigens and corresponding TCR alpha and beta chain specificity records (cases with 5+ records) ```{r message=FALSE, warning=FALSE} kable(format = "html", - df.n %>% reshape2::dcast(Gene + HLA + Epitope + Studies ~ `TCR chain`, fill = 0) %>% + df.n %>% + reshape2::dcast(Gene + HLA + Epitope + Studies ~ `TCR chain`, fill = 0) %>% + mutate(HLA = gsub("*", ".", HLA, fixed = T)) %>% filter(TRA+TRB >= 5) %>% arrange(-(TRB+TRA))) ```