Skip to content

Commit e85bd16

Browse files
committed
Add more comments to processor code
1 parent cc0a94f commit e85bd16

File tree

1 file changed

+45
-6
lines changed

1 file changed

+45
-6
lines changed

indra/sources/evex/processor.py

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,17 @@ def __init__(self, relations_table, articles_table, standoff_index):
2828
self.standoff_cache = {}
2929

3030
def process_statements(self):
31+
"""Process rows of the EVEX relations table into INDRA Statements."""
3132
for row in tqdm.tqdm(self.relations_table.itertuples(),
3233
total=len(self.relations_table),
3334
desc='Processing Evex relations'):
3435
self.statements += self.process_row(row)
3536

3637
def process_row(self, row):
3738
"""Process a row in the relations table into INDRA Statements."""
39+
40+
# First, we determine the statement type and create the subject/object
41+
# agents.
3842
pol_idx = 1 if row.refined_polarity == 'Negative' else 0
3943
stmt_types = type_indra_mappings.get(row.refined_type)
4044
if not stmt_types:
@@ -47,19 +51,27 @@ def process_row(self, row):
4751
obj_agent = get_standard_agent('EGID:%s' % target_id,
4852
db_refs={'EGID': target_id})
4953

54+
# We now figure out what articles provide evidence for this relation
5055
article_keys = self.article_lookup.get(row.general_event_id)
5156
stmts = []
5257
for article_prefix, article_id in article_keys:
58+
# These text refs are known based on info we have independent of
59+
# standoff availability
5360
text_refs = {article_prefix: article_id}
5461
pmid = article_id if article_prefix == 'PMID' else None
5562

63+
# We now find the standoff for the given relation and gather
64+
# evidence info for it if possible.
5665
standoff = self.get_standoff_for_event(article_prefix, article_id)
5766
if not standoff:
5867
evidence_info = [{}]
5968
else:
6069
evidence_info = find_evidence_info(standoff, source_id,
6170
target_id, row.refined_type,
6271
row.refined_polarity)
72+
# For each article, it's possible that multiple evidences are
73+
# available for the relation so we create separate Statements
74+
# (each with a single Evidence) here.
6375
for ev_info in evidence_info:
6476
annotations = {
6577
'evex_relation_type': row.refined_type,
@@ -76,12 +88,17 @@ def process_row(self, row):
7688
text_refs=text_refs,
7789
text=ev_info.get('text'),
7890
annotations=annotations)
91+
92+
# We can set the raw Agent text which is specific to this
93+
# given evidence.
7994
subj = copy.deepcopy(subj_agent)
8095
obj = copy.deepcopy(obj_agent)
8196
if ev_info.get('subj_text'):
8297
subj.db_refs['TEXT'] = ev_info.get('subj_text')
8398
if ev_info.get('obj_text'):
8499
obj.db_refs['TEXT'] = ev_info.get('obj_text')
100+
101+
# Finally, create the Statement object
85102
if stmt_type == Complex:
86103
stmt = Complex([subj, obj], evidence=[ev])
87104
else:
@@ -91,6 +108,7 @@ def process_row(self, row):
91108
return stmts
92109

93110
def get_standoff_for_event(self, article_prefix, article_id):
111+
"""Based on article info, return a standoff object of annotations."""
94112
key = (
95113
'pmc' if article_prefix == 'PMCID' else 'pubmed',
96114
article_id[3:] if article_prefix == 'PMCID' else article_id
@@ -201,6 +219,7 @@ def find_evidence_info(standoff, source_id, target_id, event_type,
201219

202220

203221
def get_regulation_info(standoff, regulation, source_uid, target_uid):
222+
"""Gather specific evidence info from a regulation in a standoff."""
204223
text = standoff.get_sentence_for_offset(regulation.event.start)
205224
subj = standoff.elements[source_uid]
206225
subj_text = subj.text
@@ -318,19 +337,30 @@ def process_annotations(ann_file):
318337
elements = {}
319338
reader = csv.reader(ann_file, delimiter='\t', quotechar=None)
320339
for row in reader:
340+
# The first element is always the UID
321341
uid = row[0]
322342
assert len(row) == 2 or len(row) == 3
343+
# If the row has 3 elements, then the last one is a value
323344
value = row[2] if len(row) == 3 else None
345+
# The second element can have multiple space-separated parts
324346
parts = row[1].split()
347+
# If this is an entity of some type
325348
if parts[0] in {'GGP', 'Entity'}:
326349
entity = Entity(uid, parts[0], int(parts[1]), int(parts[2]), value)
327350
elements[uid] = entity
351+
# These represent entity references like Entrez IDs
328352
elif parts[0] == 'Reference':
329353
ref_ns, ref_id = parts[2].split(':', maxsplit=1)
330354
elements[parts[1]].references[ref_ns] = ref_id
355+
# These are various event types, we enumerate them explicitly in
356+
# the standoff_event_types variable to make sure it's not some
357+
# other type of row.
331358
elif parts[0] in standoff_event_types:
332359
event = Event(uid, parts[0], int(parts[1]), int(parts[2]), value)
333360
elements[uid] = event
361+
# These are confidence values associated with regulations but also
362+
# other things like Negation. An additional complication is that it
363+
# can represent either a numerical or a qualitative confidence level.
334364
elif parts[0] == 'Confidence':
335365
# Negation confidence
336366
if isinstance(parts[1], Negation):
@@ -341,12 +371,16 @@ def process_annotations(ann_file):
341371
# Regulation confidence level
342372
else:
343373
elements[parts[1]].confidence_level = parts[2]
374+
# Represents a negation for a regulation
344375
elif parts[0] == 'Negation':
345376
elements[uid] = Negation(uid)
346377
elements[parts[1]].negation = elements[uid]
378+
# Represents a speculation for a regulation
347379
elif parts[0] == 'Speculation':
348380
elements[uid] = Speculation(uid)
349381
elements[parts[1]].speculation = elements[uid]
382+
# The remainder of cases are regulations. These are either basic
383+
# regulations or special cases like subunit-complex relations.
350384
elif len(row) == 2:
351385
if ':' in parts[0]:
352386
event_type, parent_id = parts[0].split(':')
@@ -360,6 +394,8 @@ def process_annotations(ann_file):
360394
else:
361395
assert False, row
362396

397+
# The row contains a series of arguments for the regulation that
398+
# need to be parsed out in parts
363399
arguments = {}
364400
for element in parts[1:]:
365401
role, arg_uid = element.split(':')
@@ -368,6 +404,9 @@ def process_annotations(ann_file):
368404
# placeholder for these elements that can be resolved later
369405
element_obj = elements.get(arg_uid, Unresolved(arg_uid))
370406

407+
# Some argument types can appear more than once for a regulation,
408+
# e.g., Theme for Binding so we need to sometimes turn
409+
# these into lists.
371410
if role in arguments:
372411
if not isinstance(arguments[role], list):
373412
arguments[role] = [arguments[role]]
@@ -377,7 +416,7 @@ def process_annotations(ann_file):
377416
regulation = Regulation(uid, event, arguments)
378417
elements[uid] = regulation
379418
else:
380-
print(row)
419+
logger.error('Could not process standoff file row: %s' % row)
381420
break
382421

383422
# We now need to resolve Unresolved regulation references. At this point
@@ -517,6 +556,11 @@ def paths_to_entrez_id(self, entrez_id):
517556
return paths
518557

519558

559+
@dataclass
560+
class Unresolved:
561+
uid: str
562+
563+
520564
def add_subgraph(g, obj):
521565
"""Recursively build up a graph of standoff objects."""
522566
label = '{ID | %s} | {event_type | %s}' % (obj.uid, obj.event.get_type())
@@ -542,11 +586,6 @@ def add_subgraph(g, obj):
542586
g.add_edge(obj.uid, vv.uid, label=k)
543587

544588

545-
@dataclass
546-
class Unresolved:
547-
uid: str
548-
549-
550589
# The set of event types used in the standoff format
551590
standoff_event_types = {
552591
'Binding',

0 commit comments

Comments
 (0)