Skip to content

Commit

Permalink
#389 alleles checker added
Browse files Browse the repository at this point in the history
  • Loading branch information
LuppovDaniil committed Sep 15, 2024
1 parent a548187 commit a846511
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 5 deletions.
18 changes: 17 additions & 1 deletion py_src/ChunkQC.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import pandas as pd
from collections import defaultdict

# Read the nomenclature table once and derive both lookup structures from it.
nomenclature_df = pd.read_csv('../patches/IGM_nomenclature_table.tsv', sep='\t')
# Gene names from the 'IMGT/GENE-DB' column, used for membership checks.
nomenclature_set = set(nomenclature_df['IMGT/GENE-DB'])
# Maps each 'IMGT/GENE-DB' gene name to its 'Number of alleles' value.
alleles_dict = nomenclature_df.set_index('IMGT/GENE-DB')['Number of alleles'].to_dict()

COMPLEX_COLUMNS = [
"cdr3.alpha",
Expand Down Expand Up @@ -181,3 +183,17 @@ def gene_match_check(gene_name: str) -> bool:
return gene_name in nomenclature_set
return True


def alleles_match_check(gene_name: str) -> bool:
    """Check that the allele number in a ``GENE*ALLELE`` name is plausible.

    The name is split on ``*``: the first field is the gene and the second
    field is the allele number. The allele is accepted when it lies between
    1 and the allele count stored for that gene in ``alleles_dict``.

    Args:
        gene_name: Value of a gene segment cell. Non-string values are
            accepted unchanged because there is nothing to validate.

    Returns:
        ``True`` when the value is not a string, carries no ``*`` suffix,
        names a gene missing from ``nomenclature_set``, or has an allele
        number within ``1..alleles_dict[gene]``. ``False`` when the allele
        suffix is not an integer or falls outside that range.
    """
    # Non-string cells have no allele to validate.
    if not isinstance(gene_name, str):
        return True

    parts = gene_name.split('*')
    if len(parts) < 2:
        # No allele suffix present.
        return True

    gene, allele = parts[0], parts[1]
    if gene not in nomenclature_set:
        # Unknown genes are not judged here; presumably gene_match_check
        # reports them - verify against the caller.
        return True

    try:
        allele_number = int(allele)
    except ValueError:
        # A malformed suffix (empty, or e.g. "01,OTHER") is reported as a
        # bad allele instead of aborting the whole validation run.
        return False

    # Allele numbers below 1 can never fall inside a count-based range.
    return 1 <= allele_number <= alleles_dict[gene]
14 changes: 10 additions & 4 deletions py_src/runBuidDatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from termcolor import cprint


from ChunkQC import ChunkQC, ALL_COLS, gene_match_check
from ChunkQC import ChunkQC, ALL_COLS, gene_match_check, alleles_match_check
from Cdr3Fixer import Cdr3Fixer
from DefaultDBGenerator import generate_default_db
from SlimDBGenerator import generate_slim_db
Expand Down Expand Up @@ -84,16 +84,22 @@
master_table.to_pickle('../database/vdjdb_full.pkl', )

# Validate the V and J segment columns of both chains. Masks are collected in
# the order v.alpha, j.alpha, v.beta, j.beta.
mask_gene_list = []
mask_alleles_list = []
for gene in ['alpha', 'beta']:
    for segment in ['v', 'j']:
        column = master_table[f'{segment}.{gene}']
        mask_gene_list.append(column.apply(gene_match_check))
        mask_alleles_list.append(column.apply(alleles_match_check))

# A row passes a check only if all four of its segment columns pass it.
final_mask = mask_gene_list[0]
final_mask_alleles = mask_alleles_list[0]
for mask, allele_mask in zip(mask_gene_list[1:], mask_alleles_list[1:]):
    final_mask = final_mask & mask
    final_mask_alleles = final_mask_alleles & allele_mask

# The "clear" file holds rows that pass both checks. The two "broken" files
# are built independently, so a row failing both checks appears in both.
master_table.loc[final_mask & final_mask_alleles].set_index('cdr3.alpha').to_csv('../database/vdjdb_full_gene_clear.txt', sep='\t')
master_table.loc[~final_mask].set_index('cdr3.alpha').to_csv('../database/vdjdb_full_gene_broken.txt', sep='\t')
master_table.loc[~final_mask_alleles].set_index('cdr3.alpha').to_csv('../database/vdjdb_full_allele_broken.txt', sep='\t')

cprint("Generating and writing default database", 'magenta')
default_db = generate_default_db(master_table)
Expand Down

0 comments on commit a846511

Please sign in to comment.