1
1
#!/usr/bin/env python
2
+
3
+ __author__ = "Famke Baeuerle, Gwendolyn O. Döbel, Carolin Brune and Tobias Fehrenbach"
4
+
5
+ ################################################################################
6
+ # requirements
7
+ ################################################################################
8
+
2
9
import re
3
10
import requests
4
11
import sqlite3
13
20
from ratelimit import limits , sleep_and_retry
14
21
from multiprocessing import Pool
15
22
16
- __author__ = "Famke Baeuerle, Gwendolyn O. Döbel and Tobias Fehrenbach"
17
-
23
+ ################################################################################
24
+ # variables
25
+ ################################################################################
18
26
19
27
# One-letter compartment codes; presumably the single-character suffixes
# a BiGG ID can end with -- TODO confirm against the BiGG namespace.
ALL_BIGG_COMPARTMENTS_ONE_LETTER = ('c' , 'e' , 'p' , 'm' , 'x' , 'r' , 'v' , 'n' , 'g' , 'u' , 'l' , 'h' , 'f' , 's' , 'i' , 'w' , 'y' )
20
28
# Two-letter compartment codes; used with str.endswith() to strip a
# two-character compartment suffix from a BiGG reaction ID.
ALL_BIGG_COMPARTMENTS_TWO_LETTER = ('im' , 'cx' , 'um' , 'cm' , 'mm' )
21
29
# Base URL of the BiGG web API (v2) for universal reactions.
BIGG_REACTIONS_URL = 'http://bigg.ucsd.edu/api/v2/universal/reactions/'
22
30
# Base URL of the BiGG web API (v2) for universal metabolites.
BIGG_METABOLITES_URL = 'http://bigg.ucsd.edu/api/v2/universal/metabolites/'
23
31
32
+ # .............................................
33
+ # @TODO : merge with compartment in entities.py
34
+ # .............................................
24
35
# Compartment codes of interest; presumably the compartments reactions are
# restricted to when filtering -- TODO confirm against the callers.
COMPARTMENTS = ('c' , 'e' , 'p' )
25
36
26
-
27
- def get_search_regex (other_db : Literal ['KEGG' , 'BioCyc' , 'SEED' ], metabolites : bool ) -> str :
28
- """Retrieves the search regex for BioCyc/KEGG/SEED to be used in the BiGG mapping
29
-
30
- Args:
31
- - other_db (Literal): Specifies if the search regex should be for BioCyc/KEGG/SEED
32
- - metabolites (bool): Is required if one wants to search for KEGG/SEED Compound IDs in the bigg_models_metabolites.txt
33
-
34
- Returns:
35
- str: Search regex
36
- """
37
- if other_db == 'BioCyc' :
38
- return 'BioCyc: http://identifiers.org/biocyc/META:(.*?);'
39
- elif other_db == 'KEGG' or other_db == 'SEED' :
40
- if metabolites :
41
- return f'{ other_db } Compound: http://identifiers.org/{ other_db .lower ()} .compound/(.*?);'
42
- else :
43
- return f'{ other_db } Reaction: http://identifiers.org/{ other_db .lower ()} .reaction/(.*?);'
44
-
45
-
37
+ ################################################################################
38
+ # functions
39
+ ################################################################################
40
+
46
41
def compare_ids (id1 : str , id2 : str ) -> bool :
47
42
"""Compares two strings/IDs & Returns True if one string matches most of the other
48
43
@@ -119,20 +114,19 @@ def get_reaction_compartment(bigg_id: str) -> str:
119
114
else : # Not so important but do not remove reaction as reaction in correct compartments
120
115
return 'exchange' # Probably exchange reaction
121
116
122
-
117
+ # @TEST
118
+ # @NOTE : A lot of warnings
123
119
def keep_only_reactions_in_certain_compartments (complete_df : pd .DataFrame ) -> pd .DataFrame :
124
120
"""Extracts all possible BiGG ID variations from database for a BiGG reaction ID, gets the metabolite compartments
125
121
& returns table containing only reactions which happen in one of the provided compartments
126
122
127
123
Args:
128
- - complete_df (pd.DataFrame): Table containing at least the columns 'bigg_id' & 'KEGG'/'BioCyc'
124
+ - complete_df (pd.DataFrame): Table containing at least the column 'bigg_id'.
129
125
130
126
Returns:
131
127
pd.DataFrame: Table containing reactions & their compartments
132
128
"""
133
129
tqdm .pandas ()
134
- db = 'KEGG' if 'KEGG' in complete_df .columns else 'BioCyc'
135
- complete_df = complete_df [['bigg_id' , db ]] # Remove all unnecessary columns
136
130
137
131
# (1) Find all occurrences of a BiGG reaction ID in the bigg_reactions table of the database
138
132
def get_all_similar_bigg_ids (bigg_id_in : str ) -> list [str ]:
@@ -142,7 +136,7 @@ def get_all_similar_bigg_ids(bigg_id_in: str) -> list[str]:
142
136
elif bigg_id_in .endswith (ALL_BIGG_COMPARTMENTS_TWO_LETTER ): bigg_id = bigg_id_in [:- 2 ]
143
137
else : bigg_id = bigg_id_in
144
138
145
- query = f"SELECT bigg_id , INSTR(bigg_id , '{ bigg_id } ') bi FROM bigg_reactions WHERE bi > 0"
139
+ query = f"SELECT id , INSTR(id , '{ bigg_id } ') bi FROM bigg_reactions WHERE bi > 0"
146
140
result = con .execute (query ).fetchall ()
147
141
result = [result_tuple [0 ] for result_tuple in result ] if result else [bigg_id_in ]
148
142
result = [res for res in result if compare_ids (bigg_id , res )]
@@ -167,6 +161,8 @@ def multi_get_reaction_compartment(complete_df: pd.DataFrame) -> list:
167
161
168
162
return results
169
163
164
+ print (complete_df .columns )
165
+
170
166
# Connect to database & get similar IDs (1)
171
167
print ('Getting all similar IDs...' )
172
168
con = sqlite3 .connect (PATH_TO_DB ) # Open connection to database
@@ -193,47 +189,62 @@ def multi_get_reaction_compartment(complete_df: pd.DataFrame) -> list:
193
189
return complete_df
194
190
195
191
196
- # Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis --- Modified
197
- def get_bigg2other_db ( other_db : Literal [ 'KEGG' , ' BioCyc' , 'SEED' ], metabolites : bool = False ) -> pd .DataFrame :
198
- """Uses list of BiGG reactions/metabolites to get a mapping from BiGG to KEGG/BioCyc Id
192
+ # @TEST
193
def get_bigg_db_mapping(map_to: str = 'BioCyc', metabolites: bool = True) -> pd.DataFrame:
    """Load a mapping of BiGG IDs to the IDs of another database.

    The mapping is read from the local SQLite database (``PATH_TO_DB``), from
    the table ``bigg_metabolites`` or ``bigg_reactions``.

    Args:
        map_to (str, optional): Name of the database to map to.
            Ideally a column of the table in the database,
            but SEED, KEGG and Reactome are expanded to
            '<name> Compound' / '<name> Reaction' automatically.
            Defaults to 'BioCyc'.
        metabolites (bool, optional): Flag to map reaction (False) or
            metabolite (True) IDs.
            Defaults to True.

    Raises:
        KeyError: Given database name not found in database. Cannot perform mapping.

    Returns:
        pd.DataFrame: Table with the columns 'bigg_id' and ``map_to``.
            For reactions, the table is additionally passed through
            ``keep_only_reactions_in_certain_compartments``.
    """
    # adjust name to map to if necessary
    reac_or_comp = 'Compound' if metabolites else 'Reaction'
    table_name = 'bigg_metabolites' if metabolites else 'bigg_reactions'
    if map_to in ('SEED', 'KEGG', 'Reactome'):
        map_to = ' '.join([map_to, reac_or_comp])

    # check that the requested column exists in the table
    # ---------------------------------------------------
    # The connection is only needed for this check, so it is closed right
    # away - also when the lookup itself fails.
    connection = sqlite3.connect(PATH_TO_DB)
    try:
        column_found = connection.execute(
            'SELECT 1 FROM PRAGMA_TABLE_INFO(?) WHERE name = ?',
            (table_name, map_to),
        ).fetchone()
    finally:
        connection.close()

    if not column_found:
        raise KeyError('Given database name not found in database. Cannot perform mapping.')

    # Column names such as 'KEGG Compound' contain a space and therefore have
    # to be quoted as SQL identifiers (embedded double quotes are doubled).
    quoted_column = '"' + map_to.replace('"', '""') + '"'
    query = f'SELECT * FROM {table_name} WHERE {quoted_column} IS NOT NULL'

    # actually load data
    data = load_a_table_from_database(query)
    data = data.explode(map_to, ignore_index=True)

    # reduce columns to mapping only; rename() returns a new frame, which
    # avoids modifying a column slice in place
    data = data[['id', map_to]].rename(columns={'id': 'bigg_id'})

    # filter for compartment in case of reactions
    if not metabolites:
        data = keep_only_reactions_in_certain_compartments(data)

    return data
234
247
235
- return bigg_df
236
-
237
248
238
249
# Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis --- Modified
239
250
def compare_bigg_model (complete_df : pd .DataFrame , model_entities : pd .DataFrame , metabolites : bool = False ) -> pd .DataFrame :
0 commit comments