From a8465113859ee5ee35d343386be112d08940b02e Mon Sep 17 00:00:00 2001
From: LuppovDaniil
Date: Sun, 15 Sep 2024 22:57:39 +0300
Subject: [PATCH] #389 alleles checker added

---
Review notes (this section is ignored by git-am):
- Line breaks, indentation and blank context lines were reconstructed from
  the hunk line counts; verify against the target tree before applying.
- alleles_match_check was revised so that a non-numeric allele suffix fails
  the check instead of raising ValueError inside Series.apply.
- The index hashes below are those of the original commit.

 py_src/ChunkQC.py         | 22 +++++++++++++++++++++-
 py_src/runBuidDatabase.py | 14 ++++++++++----
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/py_src/ChunkQC.py b/py_src/ChunkQC.py
index 299e5c0..3aea88a 100644
--- a/py_src/ChunkQC.py
+++ b/py_src/ChunkQC.py
@@ -2,7 +2,9 @@ import pandas as pd
 from collections import defaultdict
 
 
-nomenclature_set = set(pd.read_csv('../patches/IGM_nomenclature_table.tsv', sep='\t')['IMGT/GENE-DB'])
+nomenclature_df = pd.read_csv('../patches/IGM_nomenclature_table.tsv', sep='\t')
+nomenclature_set = set(nomenclature_df['IMGT/GENE-DB'])
+alleles_dict = nomenclature_df.set_index('IMGT/GENE-DB')['Number of alleles'].to_dict()
 
 COMPLEX_COLUMNS = [
     "cdr3.alpha",
@@ -181,3 +183,21 @@ def gene_match_check(gene_name: str) -> bool:
         return gene_name in nomenclature_set
     return True
 
+
+def alleles_match_check(gene_name: str) -> bool:
+    """Check the allele number in ``gene*allele`` against the nomenclature table.
+
+    Non-string values, names without ``*`` and genes missing from the
+    nomenclature table are not checked and pass. A non-numeric allele
+    suffix fails the check instead of raising ValueError.
+    """
+    if not isinstance(gene_name, str):
+        return True
+    parts = gene_name.split('*')
+    if len(parts) < 2 or parts[0] not in nomenclature_set:
+        return True
+    try:
+        # Only the text between the first and second '*' is the allele.
+        return int(parts[1]) <= alleles_dict[parts[0]]
+    except ValueError:
+        return False
diff --git a/py_src/runBuidDatabase.py b/py_src/runBuidDatabase.py
index dd4e369..a341548 100644
--- a/py_src/runBuidDatabase.py
+++ b/py_src/runBuidDatabase.py
@@ -5,7 +5,7 @@ from termcolor import cprint
 
 
 
-from ChunkQC import ChunkQC, ALL_COLS, gene_match_check
+from ChunkQC import ChunkQC, ALL_COLS, gene_match_check, alleles_match_check
 from Cdr3Fixer import Cdr3Fixer
 from DefaultDBGenerator import generate_default_db
 from SlimDBGenerator import generate_slim_db
@@ -84,16 +84,22 @@ master_table.to_pickle('../database/vdjdb_full.pkl', )
 
 
 mask_gene_list = []
+mask_alleles_list = []
 for gene in ['alpha', 'beta']:
     mask_gene_list.append(master_table[f'v.{gene}'].apply(gene_match_check))
     mask_gene_list.append(master_table[f'j.{gene}'].apply(gene_match_check))
-final_mask = mask_gene_list[0]
+    mask_alleles_list.append(master_table[f'v.{gene}'].apply(alleles_match_check))
+    mask_alleles_list.append(master_table[f'j.{gene}'].apply(alleles_match_check))
 
-for mask in mask_gene_list[1:]:
+final_mask = mask_gene_list[0]
+final_mask_alleles = mask_alleles_list[0]
+for mask, allele_mask in zip(mask_gene_list[1:], mask_alleles_list[1:]):
     final_mask = final_mask & mask
+    final_mask_alleles = final_mask_alleles & allele_mask
 
-master_table.loc[final_mask].set_index('cdr3.alpha').to_csv('../database/vdjdb_full_gene_clear.txt', sep='\t')
+master_table.loc[final_mask & final_mask_alleles].set_index('cdr3.alpha').to_csv('../database/vdjdb_full_gene_clear.txt', sep='\t')
 master_table.loc[~final_mask].set_index('cdr3.alpha').to_csv('../database/vdjdb_full_gene_broken.txt', sep='\t')
+master_table.loc[~final_mask_alleles].set_index('cdr3.alpha').to_csv('../database/vdjdb_full_allele_broken.txt', sep='\t')
 
 cprint("Generating and writing default database", 'magenta')
 default_db = generate_default_db(master_table)