Skip to content

Commit 97252e1

Browse files
committed
fixed analysis_...
build connections to new database structure of the bigg tables #99
1 parent b6fd86b commit 97252e1

File tree

4 files changed

+90
-67
lines changed

4 files changed

+90
-67
lines changed

src/refinegems/analysis_biocyc.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
import libchebipy
2626
import requests
2727
from refinegems.entities import get_model_genes, get_model_reacs_or_metabs, compare_gene_lists
28-
from refinegems.analysis_db import get_bigg2other_db, compare_bigg_model, add_stoichiometric_values_to_reacs, BIGG_METABOLITES_URL
28+
from refinegems.analysis_db import get_bigg_db_mapping, compare_bigg_model, add_stoichiometric_values_to_reacs, BIGG_METABOLITES_URL
2929
from refinegems.io import parse_fasta_headers
3030
import os
3131

@@ -191,7 +191,7 @@ def get_missing_reactions(
191191
statistics_df.loc['Reaction', 'Total'] = len(missing_reactions['Reaction'].unique().tolist())
192192

193193
# Get BiGG BioCyc
194-
bigg2biocyc_reacs = get_bigg2other_db('BioCyc')
194+
bigg2biocyc_reacs = get_bigg_db_mapping('BioCyc',False)
195195

196196
# Subset missing_reactions with BiGG BioCyc
197197
missing_reactions.rename(columns={'Reaction': 'BioCyc'}, inplace=True)
@@ -260,7 +260,7 @@ def get_missing_metabolites(
260260
statistics_df.loc['Metabolite', 'Total'] = len(biocyc_metabs_from_reacs['Compound'].unique().tolist())
261261

262262
# Get BiGG BioCyc
263-
bigg2biocyc_metabs = get_bigg2other_db('BioCyc', True)
263+
bigg2biocyc_metabs = get_bigg_db_mapping('BioCyc', True)
264264

265265
# Subset biocyc_metabs with BiGG BioCyc -> To get only metabolites with BiGG IDs
266266
missing_metabolites = bigg2biocyc_metabs.merge(biocyc_metabs, on='BioCyc') # missing_metabolites

src/refinegems/analysis_db.py

Lines changed: 71 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
11
#!/usr/bin/env python
2+
3+
__author__ = "Famke Baeuerle, Gwendolyn O. Döbel, Carolin Brune and Tobias Fehrenbach"
4+
5+
################################################################################
6+
# requirements
7+
################################################################################
8+
29
import re
310
import requests
411
import sqlite3
@@ -13,36 +20,24 @@
1320
from ratelimit import limits, sleep_and_retry
1421
from multiprocessing import Pool
1522

16-
__author__ = "Famke Baeuerle, Gwendolyn O. Döbel and Tobias Fehrenbach"
17-
23+
################################################################################
24+
# variables
25+
################################################################################
1826

1927
ALL_BIGG_COMPARTMENTS_ONE_LETTER = ('c', 'e', 'p', 'm', 'x', 'r', 'v', 'n', 'g', 'u', 'l', 'h', 'f', 's', 'i', 'w', 'y')
2028
ALL_BIGG_COMPARTMENTS_TWO_LETTER = ('im', 'cx', 'um', 'cm', 'mm')
2129
BIGG_REACTIONS_URL = 'http://bigg.ucsd.edu/api/v2/universal/reactions/'
2230
BIGG_METABOLITES_URL = 'http://bigg.ucsd.edu/api/v2/universal/metabolites/'
2331

32+
# .............................................
33+
# @TODO : merge with compartment in entities.py
34+
# .............................................
2435
COMPARTMENTS = ('c', 'e', 'p')
2536

26-
27-
def get_search_regex(other_db: Literal['KEGG', 'BioCyc', 'SEED'], metabolites: bool) -> str:
28-
"""Retrieves the search regex for BioCyc/KEGG/SEED to be used in the BiGG mapping
29-
30-
Args:
31-
- other_db (Literal): Specifies if the search regex should be for BioCyc/KEGG/SEED
32-
- metabolites (bool): Is required if one wants to search for KEGG/SEED Compound IDs in the bigg_models_metabolites.txt
33-
34-
Returns:
35-
str: Search regex
36-
"""
37-
if other_db == 'BioCyc':
38-
return 'BioCyc: http://identifiers.org/biocyc/META:(.*?);'
39-
elif other_db == 'KEGG' or other_db == 'SEED':
40-
if metabolites:
41-
return f'{other_db} Compound: http://identifiers.org/{other_db.lower()}.compound/(.*?);'
42-
else:
43-
return f'{other_db} Reaction: http://identifiers.org/{other_db.lower()}.reaction/(.*?);'
44-
45-
37+
################################################################################
38+
# functions
39+
################################################################################
40+
4641
def compare_ids(id1: str, id2: str) -> bool:
4742
"""Compares two strings/IDs & Returns True if one string matches most of the other
4843
@@ -119,20 +114,19 @@ def get_reaction_compartment(bigg_id: str) -> str:
119114
else: # Not so important but do not remove reaction as reaction in correct compartments
120115
return 'exchange' # Probably exchange reaction
121116

122-
117+
# @TEST
118+
# @NOTE : A lot of warnings
123119
def keep_only_reactions_in_certain_compartments(complete_df: pd.DataFrame) -> pd.DataFrame:
124120
"""Extracts all possible BiGG ID variations from database for a BiGG reaction ID, gets the metabolite compartments
125121
& returns table containing only reactions which happen in one of the provided compartments
126122
127123
Args:
128-
- complete_df (pd.DataFrame): Table containing at least the columns 'bigg_id' & 'KEGG'/'BioCyc'
124+
- complete_df (pd.DataFrame): Table containing at least the column 'bigg_id'.
129125
130126
Returns:
131127
pd.DataFrame: Table containing reactions & their compartments
132128
"""
133129
tqdm.pandas()
134-
db = 'KEGG' if 'KEGG' in complete_df.columns else 'BioCyc'
135-
complete_df = complete_df[['bigg_id', db]] # Remove all unnecessary columns
136130

137131
# (1) Find all occurrences of a BiGG reaction ID in bigg_reactions table in database
138132
def get_all_similar_bigg_ids(bigg_id_in: str) -> list[str]:
@@ -142,7 +136,7 @@ def get_all_similar_bigg_ids(bigg_id_in: str) -> list[str]:
142136
elif bigg_id_in.endswith(ALL_BIGG_COMPARTMENTS_TWO_LETTER): bigg_id = bigg_id_in[:-2]
143137
else: bigg_id = bigg_id_in
144138

145-
query = f"SELECT bigg_id, INSTR(bigg_id, '{bigg_id}') bi FROM bigg_reactions WHERE bi > 0"
139+
query = f"SELECT id, INSTR(id, '{bigg_id}') bi FROM bigg_reactions WHERE bi > 0"
146140
result = con.execute(query).fetchall()
147141
result = [result_tuple[0] for result_tuple in result] if result else [bigg_id_in]
148142
result = [res for res in result if compare_ids(bigg_id, res)]
@@ -167,6 +161,8 @@ def multi_get_reaction_compartment(complete_df: pd.DataFrame) -> list:
167161

168162
return results
169163

164+
print(complete_df.columns)
165+
170166
# Connect to database & get similar IDs (1)
171167
print('Getting all similar IDs...')
172168
con = sqlite3.connect(PATH_TO_DB) # Open connection to database
@@ -193,47 +189,62 @@ def multi_get_reaction_compartment(complete_df: pd.DataFrame) -> list:
193189
return complete_df
194190

195191

196-
# Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis --- Modified
197-
def get_bigg2other_db(other_db: Literal['KEGG', 'BioCyc', 'SEED'], metabolites: bool=False) -> pd.DataFrame:
198-
"""Uses list of BiGG reactions/metabolites to get a mapping from BiGG to KEGG/BioCyc Id
192+
# @TEST
193+
def get_bigg_db_mapping(map_to:str='BioCyc', metabolites:bool=True) -> pd.DataFrame:
194+
"""Load a mapping of BiGG IDs to a specified database from the local database.
199195
200196
Args:
201-
- other_db (Literal): Set to 'KEGG'/'BioCyc'/'SEED' to map KEGG/BioCyc/SEED IDs to BiGG IDs
202-
- metabolites (bool): Set to True to map other_db IDs to BiGG IDs for metabolites
197+
map_to (str, optional): Name of the database to map to.
198+
Ideally a column of the table in the database,
199+
but SEED, KEGG and BioCyc are valid as well.
200+
Defaults to 'BioCyc'.
201+
metabolites (bool, optional): Flag to map reaction (False) or metabolite (True) IDs.
202+
Defaults to True.
203+
204+
Raises:
205+
KeyError: Given database name not found in database. Cannot perform mapping.
203206
204207
Returns:
205-
pd.DataFrame: Table containing BiGG Ids with corresponding KEGG/BioCyc/SEED Ids
208+
pd.DataFrame: The mapping as a table.
206209
"""
210+
211+
# adjust name to map to if necessary
212+
reac_or_comp = 'Compound' if metabolites else 'Reaction'
213+
table_name = 'bigg_metabolites' if metabolites else 'bigg_reactions'
214+
if map_to in ['SEED','KEGG','Reactome']:
215+
map_to = ' '.join([map_to, reac_or_comp])
207216

208-
# Get only rows with BioCyc/KEGG entries
209-
db_table_name = 'bigg_metabolites' if metabolites else 'bigg_reactions'
210-
reaction_or_compound = 'Compound' if metabolites else 'Reaction'
211-
other_db_query = other_db if other_db == 'BioCyc' else ' '.join([other_db, reaction_or_compound])
212-
bigg_db_query = f"SELECT *, INSTR(database_links, '{other_db_query}:') o_db FROM {db_table_name} WHERE o_db > 0"
213-
bigg_db_df = load_a_table_from_database(bigg_db_query)
214-
215-
db_search_regex = get_search_regex(other_db, metabolites)
216-
217-
def find_other_db(database_links: str):
218-
m = re.findall(
219-
db_search_regex,
220-
str(database_links))
221-
if m:
222-
return m
223-
else:
224-
return None
225-
226-
bigg_db_df[other_db] = bigg_db_df.apply(
227-
lambda row: find_other_db(row['database_links']), axis=1)
228-
bigg_db_df = bigg_db_df.explode(other_db, ignore_index=True)
229-
217+
# download BiGG tables from database
218+
# ----------------------------------
219+
# build connection to DB
220+
connection = sqlite3.connect(PATH_TO_DB)
221+
cursor = connection.cursor()
222+
223+
# retrieve only mappings to a specific database
224+
result = cursor.execute('SELECT 1 FROM PRAGMA_TABLE_INFO(?) WHERE name = ?',(table_name,map_to))
225+
possible_db = result.fetchone()
226+
if possible_db:
227+
query = f'SELECT * FROM {table_name} WHERE {map_to} IS NOT NULL'
228+
else:
229+
raise KeyError('Given database name not found in database. Cannot perform mapping.')
230+
231+
# actually load data
232+
data = load_a_table_from_database(query)
233+
data = data.explode(map_to, ignore_index=True)
234+
235+
# reduce columns to mapping only
236+
data = data[['id',map_to]]
237+
data.rename(columns={'id':'bigg_id'}, inplace=True)
238+
239+
# filter for compartment in case of reactions
230240
if not metabolites:
231-
bigg_db_df = keep_only_reactions_in_certain_compartments(bigg_db_df)
232-
233-
bigg_df = bigg_db_df[['bigg_id', other_db]] if metabolites else bigg_db_df[['bigg_id', other_db, 'compartment', 'id_group']]
241+
data = keep_only_reactions_in_certain_compartments(data)
242+
243+
# close connection to database
244+
connection.close()
245+
246+
return data
234247

235-
return bigg_df
236-
237248

238249
# Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis --- Modified
239250
def compare_bigg_model(complete_df: pd.DataFrame, model_entities: pd.DataFrame, metabolites: bool=False) -> pd.DataFrame:

src/refinegems/analysis_kegg.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from libsbml import Model as libModel
1818
from refinegems.io import parse_gff_for_gp_info
1919
from refinegems.entities import get_model_genes, compare_gene_lists, get_model_reacs_or_metabs
20-
from refinegems.analysis_db import get_bigg2other_db, compare_bigg_model
20+
from refinegems.analysis_db import get_bigg_db_mapping, compare_bigg_model
2121

2222
__author__ = "Famke Baeuerle"
2323

@@ -177,7 +177,7 @@ def kegg_gene_comp(model: libModel, organismid: str, gff_file: str) -> pd.DataFr
177177
model_genes = get_model_genes(model, True)
178178
model_reactions = get_model_reacs_or_metabs(model)
179179
kegg_genes = get_kegg_genes(organismid)
180-
bigg_kegg = get_bigg2other_db('KEGG')
180+
bigg_kegg = get_bigg_db_mapping('KEGG',False)
181181
genes_kegg_notmodel = compare_gene_lists(model_genes, kegg_genes)
182182
locus_gpr = parse_gff_for_gp_info(gff_file)
183183
locus_ec = get_locus_ec(genes_kegg_notmodel)

src/refinegems/databases.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
#!/usr/bin/env python
2+
3+
################################################################################
4+
# requirements
5+
################################################################################
6+
27
import io
38
import re
49
import sqlite3
@@ -10,13 +15,21 @@
1015
from pathlib import Path
1116
from importlib.resources import files
1217

13-
__author__ = 'Gwendolyn O. Döbel'
18+
__author__ = 'Gwendolyn O. Döbel und Carolin Brune'
19+
20+
################################################################################
21+
# variables
22+
################################################################################
1423

1524
PATH_TO_DB_FOLDER = files('refinegems.data.database')
1625
PATH_TO_DB = PATH_TO_DB_FOLDER.joinpath('data.db')
1726
VERSION_FILE = PATH_TO_DB_FOLDER.joinpath('current_bigg_db_version.txt')
1827
VERSION_URL = 'http://bigg.ucsd.edu/api/v2/database_version'
1928

29+
################################################################################
30+
# functions
31+
################################################################################
32+
2033
class ValidationCodes(Enum):
2134
"""Validation codes for the database
2235
@@ -116,7 +129,6 @@ def create_sbo_media_database(db_cursor: sqlite3.Cursor):
116129
db_cursor.executescript(schema.read())
117130

118131

119-
# @TEST
120132
def update_bigg_db(latest_version: str, db_connection: sqlite3.Connection) -> dict:
121133
"""Updates the BiGG tables 'bigg_metabolites' & 'bigg_reactions' within a database (data.db)
122134

0 commit comments

Comments
 (0)