@@ -28,13 +28,17 @@ def __init__(self, relations_table, articles_table, standoff_index):
28
28
self .standoff_cache = {}
29
29
30
30
def process_statements (self ):
31
+ """Process rows of the EVEX relations table into INDRA Statements."""
31
32
for row in tqdm .tqdm (self .relations_table .itertuples (),
32
33
total = len (self .relations_table ),
33
34
desc = 'Processing Evex relations' ):
34
35
self .statements += self .process_row (row )
35
36
36
37
def process_row (self , row ):
37
38
"""Process a row in the relations table into INDRA Statements."""
39
+
40
+ # First, we determine the statement type and create the subject/object
41
+ # agents.
38
42
pol_idx = 1 if row .refined_polarity == 'Negative' else 0
39
43
stmt_types = type_indra_mappings .get (row .refined_type )
40
44
if not stmt_types :
@@ -47,19 +51,27 @@ def process_row(self, row):
47
51
obj_agent = get_standard_agent ('EGID:%s' % target_id ,
48
52
db_refs = {'EGID' : target_id })
49
53
54
+ # We now figure out what articles provide evidence for this relation
50
55
article_keys = self .article_lookup .get (row .general_event_id )
51
56
stmts = []
52
57
for article_prefix , article_id in article_keys :
58
+ # These text refs are known based on info we have independent of
59
+ # standoff availability
53
60
text_refs = {article_prefix : article_id }
54
61
pmid = article_id if article_prefix == 'PMID' else None
55
62
63
+ # We now find the standoff for the given relation and gather
64
+ # evidence info for it if possible.
56
65
standoff = self .get_standoff_for_event (article_prefix , article_id )
57
66
if not standoff :
58
67
evidence_info = [{}]
59
68
else :
60
69
evidence_info = find_evidence_info (standoff , source_id ,
61
70
target_id , row .refined_type ,
62
71
row .refined_polarity )
72
+ # For each article, it's possible that multiple evidences are
73
+ # available for the relation so we create separate Statements
74
+ # (each with a single Evidence) here.
63
75
for ev_info in evidence_info :
64
76
annotations = {
65
77
'evex_relation_type' : row .refined_type ,
@@ -76,12 +88,17 @@ def process_row(self, row):
76
88
text_refs = text_refs ,
77
89
text = ev_info .get ('text' ),
78
90
annotations = annotations )
91
+
92
+ # We can set the raw Agent text which is specific to this
93
+ # given evidence.
79
94
subj = copy .deepcopy (subj_agent )
80
95
obj = copy .deepcopy (obj_agent )
81
96
if ev_info .get ('subj_text' ):
82
97
subj .db_refs ['TEXT' ] = ev_info .get ('subj_text' )
83
98
if ev_info .get ('obj_text' ):
84
99
obj .db_refs ['TEXT' ] = ev_info .get ('obj_text' )
100
+
101
+ # Finally, create the Statement object
85
102
if stmt_type == Complex :
86
103
stmt = Complex ([subj , obj ], evidence = [ev ])
87
104
else :
@@ -91,6 +108,7 @@ def process_row(self, row):
91
108
return stmts
92
109
93
110
def get_standoff_for_event (self , article_prefix , article_id ):
111
+ """Based on article info, return a standoff object of annotations."""
94
112
key = (
95
113
'pmc' if article_prefix == 'PMCID' else 'pubmed' ,
96
114
article_id [3 :] if article_prefix == 'PMCID' else article_id
@@ -201,6 +219,7 @@ def find_evidence_info(standoff, source_id, target_id, event_type,
201
219
202
220
203
221
def get_regulation_info (standoff , regulation , source_uid , target_uid ):
222
+ """Gather specific evidence info from a regulation in a standoff."""
204
223
text = standoff .get_sentence_for_offset (regulation .event .start )
205
224
subj = standoff .elements [source_uid ]
206
225
subj_text = subj .text
@@ -318,19 +337,30 @@ def process_annotations(ann_file):
318
337
elements = {}
319
338
reader = csv .reader (ann_file , delimiter = '\t ' , quotechar = None )
320
339
for row in reader :
340
+ # The first element is always the UID
321
341
uid = row [0 ]
322
342
assert len (row ) == 2 or len (row ) == 3
343
+ # If the row has 3 elements, then the last one is a value
323
344
value = row [2 ] if len (row ) == 3 else None
345
+ # The second element can have multiple space-separated parts
324
346
parts = row [1 ].split ()
347
+ # If this is an entity of some type
325
348
if parts [0 ] in {'GGP' , 'Entity' }:
326
349
entity = Entity (uid , parts [0 ], int (parts [1 ]), int (parts [2 ]), value )
327
350
elements [uid ] = entity
351
+ # These represent entity references like Entrez IDs
328
352
elif parts [0 ] == 'Reference' :
329
353
ref_ns , ref_id = parts [2 ].split (':' , maxsplit = 1 )
330
354
elements [parts [1 ]].references [ref_ns ] = ref_id
355
+ # These are various event types, we enumerate them explicitly in
356
+ # the standoff_event_types variable to make sure it's not some
357
+ # other type of row.
331
358
elif parts [0 ] in standoff_event_types :
332
359
event = Event (uid , parts [0 ], int (parts [1 ]), int (parts [2 ]), value )
333
360
elements [uid ] = event
361
+ # These are confidence values associated with regulations but also
362
+ # other things like Negation. An additional complication is that it
363
+ # can either represent a numerical or a qualitative confidence level.
334
364
elif parts [0 ] == 'Confidence' :
335
365
# Negation confidence
336
366
if isinstance (parts [1 ], Negation ):
@@ -341,12 +371,16 @@ def process_annotations(ann_file):
341
371
# Regulation confidence level
342
372
else :
343
373
elements [parts [1 ]].confidence_level = parts [2 ]
374
+ # Represents a negation for a regulation
344
375
elif parts [0 ] == 'Negation' :
345
376
elements [uid ] = Negation (uid )
346
377
elements [parts [1 ]].negation = elements [uid ]
378
+ # Represents a speculation for a regulation
347
379
elif parts [0 ] == 'Speculation' :
348
380
elements [uid ] = Speculation (uid )
349
381
elements [parts [1 ]].speculation = elements [uid ]
382
+ # The remainder of cases are regulations. These are either basic
383
+ # regulations or special cases like subunit-complex relations.
350
384
elif len (row ) == 2 :
351
385
if ':' in parts [0 ]:
352
386
event_type , parent_id = parts [0 ].split (':' )
@@ -360,6 +394,8 @@ def process_annotations(ann_file):
360
394
else :
361
395
assert False , row
362
396
397
+ # The row contains a series of arguments for the regulation that
398
+ # need to be parsed out in parts
363
399
arguments = {}
364
400
for element in parts [1 :]:
365
401
role , arg_uid = element .split (':' )
@@ -368,6 +404,9 @@ def process_annotations(ann_file):
368
404
# placeholder for these elements that can be resolved later
369
405
element_obj = elements .get (arg_uid , Unresolved (arg_uid ))
370
406
407
+ # Some argument roles can occur more than once for a regulation,
408
+ # e.g., Theme for Binding, so we sometimes need to turn
409
+ # these into lists.
371
410
if role in arguments :
372
411
if not isinstance (arguments [role ], list ):
373
412
arguments [role ] = [arguments [role ]]
@@ -377,7 +416,7 @@ def process_annotations(ann_file):
377
416
regulation = Regulation (uid , event , arguments )
378
417
elements [uid ] = regulation
379
418
else :
380
- print ( row )
419
+ logger . error ( 'Could not process standoff file row: %s' % row )
381
420
break
382
421
383
422
# We now need to resolve Unresolved regulation references. At this point
@@ -517,6 +556,11 @@ def paths_to_entrez_id(self, entrez_id):
517
556
return paths
518
557
519
558
559
+ @dataclass
560
+ class Unresolved :
561
+ uid : str
562
+
563
+
520
564
def add_subgraph (g , obj ):
521
565
"""Recursively build up a graph of standoff objects."""
522
566
label = '{ID | %s} | {event_type | %s}' % (obj .uid , obj .event .get_type ())
@@ -542,11 +586,6 @@ def add_subgraph(g, obj):
542
586
g .add_edge (obj .uid , vv .uid , label = k )
543
587
544
588
545
- @dataclass
546
- class Unresolved :
547
- uid : str
548
-
549
-
550
589
# The set of event types used in the standoff format
551
590
standoff_event_types = {
552
591
'Binding' ,
0 commit comments