Skip to content

Optimizing evidence representation #998

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
10 changes: 7 additions & 3 deletions indra/preassembler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def _ev_keys(sts):
if 'prior_uuids' not in ev.annotations:
ev.annotations['prior_uuids'] = []
ev.annotations['prior_uuids'].append(stmt.uuid)
new_stmt.evidence.append(ev)
new_stmt.add_evidence(ev)
ev_keys.add(ev_key)
end_ev_keys = _ev_keys([new_stmt])
if len(end_ev_keys) != len(start_ev_keys):
Expand Down Expand Up @@ -990,10 +990,14 @@ def flatten_evidence(stmts, collect_from=None):
def _flatten_evidence_for_stmt(stmt, collect_from):
    """Collect the evidences of a statement and of its related statements.

    Evidences are deduplicated by their ``matches_key()``. The statement's
    own evidences are indexed first, then the recursively flattened
    evidences of each related statement are merged in. When two evidences
    share a key, the one merged last is kept, at the position where the key
    was first seen.

    Parameters
    ----------
    stmt : object
        A statement exposing ``evidence``, ``supports`` and ``supported_by``.
    collect_from : str
        If 'supports', related statements are taken from ``stmt.supports``;
        for any other value they are taken from ``stmt.supported_by``.

    Returns
    -------
    list
        The deduplicated evidences of the statement and of all statements
        reachable through the chosen relation.
    """
    supp_stmts = (stmt.supports if collect_from == 'supports'
                  else stmt.supported_by)
    # Read stmt.evidence a single time and index it by matches key
    evs = {ev.matches_key(): ev for ev in stmt.evidence}
    for supp_stmt in supp_stmts:
        # Later evidences replace earlier ones that have the same key
        for ev in _flatten_evidence_for_stmt(supp_stmt, collect_from):
            evs[ev.matches_key()] = ev
    return list(evs.values())


Expand Down
7 changes: 6 additions & 1 deletion indra/preassembler/grounding_mapper/adeft.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,11 @@ def run_adeft_disambiguation(stmt, agent, idx):
return False
# Initialize annotations if needed so Adeft predicted
# probabilities can be added to Agent annotations
annots = stmt.evidence[0].annotations

evs = stmt.evidence
# Note that the assumption here is that the statement only has a single
# piece of evidence (typically a raw statement)
annots = evs[0].annotations
agent_txt = agent.db_refs['TEXT']
if 'agents' in annots:
if 'adeft' not in annots['agents']:
Expand Down Expand Up @@ -89,6 +93,7 @@ def run_adeft_disambiguation(stmt, agent, idx):
standardize_refs=True)
annots['agents']['adeft'][idx] = disamb_scores
success = True
stmt.evidence = evs
return success


Expand Down
4 changes: 3 additions & 1 deletion indra/sources/sparser/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,10 @@ def set_statements_pmid(self, pmid):
ev['pmid'] = pmid
# Replace PMID value in extracted Statements next
for stmt in self.statements:
for ev in stmt.evidence:
evs = stmt.evidence
for ev in evs:
ev.pmid = pmid
stmt.evidence = evs


def _fix_agent(agent):
Expand Down
4 changes: 3 additions & 1 deletion indra/sources/trips/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,8 +813,10 @@ def get_cause_events(mod_event_types):

for stmt in stmts_to_make:
stmt.enz = enz
for ev in stmt.evidence:
evs = stmt.evidence
for ev in evs:
ev.epistemics['direct'] = False
stmt.evidence = evs
self.statements.append(stmt)

self._add_extracted(event_type, event.attrib['id'])
Expand Down
17 changes: 8 additions & 9 deletions indra/statements/evidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ class Evidence(object):
and is set by said Statement. It is useful for tracing ownership of
an Evidence object.
"""
# Attribute names stored in slots instead of a per-instance __dict__;
# each name is listed exactly once.
__slots__ = ['source_api', 'source_id', 'pmid', 'text', 'annotations',
             'epistemics', 'context', 'text_refs', 'source_hash', 'stmt_tag']

def __init__(self, source_api=None, source_id=None, pmid=None, text=None,
annotations=None, epistemics=None, context=None,
text_refs=None):
Expand All @@ -80,15 +84,10 @@ def __init__(self, source_api=None, source_id=None, pmid=None, text=None,
self.stmt_tag = None

def __setstate__(self, state):
    """Restore the object's attributes from a pickled state.

    Parameters
    ----------
    state : dict or tuple
        Either a dict mapping attribute names to values, or a tuple whose
        second element is such a dict (the form used for slots-based
        objects).
    """
    # With a slots-based object, state is a tuple, otherwise it's a dict
    if isinstance(state, tuple):
        state = state[1]
    # Work on a copy so that the caller's state object is left untouched
    state = dict(state or {})
    # A state lacking text_refs is restored with an empty dict; every other
    # missing attribute is restored as None
    state.setdefault('text_refs', {})
    for slot in self.__slots__:
        setattr(self, slot, state.get(slot))

def get_source_hash(self, refresh=False):
"""Get a hash based off of the source of this statement.
Expand Down
50 changes: 39 additions & 11 deletions indra/statements/statements.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@
import abc
import sys
import uuid
import json
import gzip
import logging
import networkx
import itertools
Expand Down Expand Up @@ -253,16 +255,7 @@ class Statement(object):
_agent_order = NotImplemented

def __init__(self, evidence=None, supports=None, supported_by=None):
if evidence is None:
self.evidence = []
elif isinstance(evidence, Evidence):
self.evidence = [evidence]
elif isinstance(evidence, list):
self.evidence = evidence
else:
raise ValueError('evidence must be an Evidence object, a list '
'(of Evidence objects), or None.')

self.evidence = evidence
# Initialize supports/supported_by fields, which should be lists
self.supports = supports if supports else []
self.supported_by = supported_by if supported_by else []
Expand All @@ -272,6 +265,41 @@ def __init__(self, evidence=None, supports=None, supported_by=None):
self._shallow_hash = None
return

@property
def evidence(self):
    """Return the Statement's evidences as a list of Evidence objects.

    The evidences are kept in ``_evidence`` as gzip-compressed, UTF-8
    encoded JSON. Every access decompresses and deserializes that payload
    into a newly built list, so changes made to the returned list or to its
    elements are only stored once the list is assigned back to
    ``evidence``.
    """
    payload = gzip.decompress(self._evidence).decode('utf-8')
    return [Evidence._from_json(ev_json) for ev_json in json.loads(payload)]

@evidence.setter
def evidence(self, evidence):
    """Serialize, encode and compress the given evidence(s) for storage.

    Parameters
    ----------
    evidence : None or indra.statements.Evidence or list
        None is stored as an empty list, a single Evidence as a list of
        one element, and a list (of Evidence objects) as is.

    Raises
    ------
    ValueError
        If evidence is neither None, an Evidence nor a list.
    """
    if evidence is None:
        ev_list = []
    elif isinstance(evidence, Evidence):
        ev_list = [evidence]
    elif isinstance(evidence, list):
        ev_list = evidence
    else:
        raise ValueError('evidence must be an Evidence object, a list '
                         '(of Evidence objects), or None.')
    # Serialize each Evidence to JSON, then encode and compress the result
    serialized = json.dumps([ev.to_json() for ev in ev_list])
    self._evidence = gzip.compress(serialized.encode('utf-8'))

def add_evidence(self, ev):
    """Add a new Evidence at the end of the Statement's evidence list.

    Parameters
    ----------
    ev : indra.statements.Evidence
        The Evidence object to append to the Statement's evidences.
    """
    # Fetch the current list, extend it in place, then assign it back so
    # that the extended list is what the Statement stores.
    current = self.evidence
    current.extend((ev,))
    self.evidence = current

def matches_key(self):
raise NotImplementedError("Method must be implemented in child class.")

Expand Down Expand Up @@ -564,7 +592,7 @@ def make_generic_copy(self, deeply=False):
kwargs = deepcopy(self.__dict__)
else:
kwargs = self.__dict__.copy()
for attr in ['evidence', 'belief', 'uuid', 'supports', 'supported_by',
for attr in ['_evidence', 'belief', 'uuid', 'supports', 'supported_by',
'is_activation']:
kwargs.pop(attr, None)
for attr in ['_full_hash', '_shallow_hash']:
Expand Down
4 changes: 3 additions & 1 deletion indra/tests/test_assemble_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,11 +588,13 @@ def test_merge_groundings():

def test_merge_deltas():
def add_annots(stmt):
for ev in stmt.evidence:
evs = stmt.evidence
for ev in evs:
ev.annotations['subj_adjectives'] = stmt.subj.delta.adjectives
ev.annotations['obj_adjectives'] = stmt.obj.delta.adjectives
ev.annotations['subj_polarity'] = stmt.subj.delta.polarity
ev.annotations['obj_polarity'] = stmt.obj.delta.polarity
stmt.evidence = evs
return stmt
# d1 = {'adjectives': ['a', 'b', 'c'], 'polarity': 1}
# d2 = {'adjectives': [], 'polarity': -1}
Expand Down
4 changes: 2 additions & 2 deletions indra/tests/test_medscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,8 @@ def test_evidence():
coords = s0.evidence[0].annotations['agents']['coords']
assert isinstance(coords, list), type(coords)
assert len(coords) == 2, len(coords)
assert coords[0] == (90, 97), coords[0]
assert coords[1] == (106, 120), coords[1]
assert tuple(coords[0]) == (90, 97), tuple(coords[0])
assert tuple(coords[1]) == (106, 120), tuple(coords[1])


def test_molsynthesis_positive():
Expand Down
20 changes: 12 additions & 8 deletions indra/tests/test_preassembler.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,13 @@ def test_combine_duplicates():
# The statements come out sorted by their matches_key
assert len(pa.unique_stmts) == 4, len(pa.unique_stmts)
num_evs =[len(s.evidence) for s in pa.unique_stmts]
assert pa.unique_stmts[0].matches(p6) # MEK dephos ERK
assert num_evs[0] == 3, num_evs[0]
assert pa.unique_stmts[1].matches(p9) # SRC dephos KRAS
assert pa.unique_stmts[0].matches(p6) # MEK dephos ERK
assert num_evs[0] == 3, num_evs
assert pa.unique_stmts[1].matches(p9) # SRC dephos KRAS
assert num_evs[1] == 1, num_evs[1]
assert pa.unique_stmts[2].matches(p5) # MEK phos ERK
assert pa.unique_stmts[2].matches(p5) # MEK phos ERK
assert num_evs[2] == 1, num_evs[2]
assert pa.unique_stmts[3].matches(p1) # RAF phos MEK
assert pa.unique_stmts[3].matches(p1) # RAF phos MEK
assert num_evs[3] == 4, num_evs[3]


Expand Down Expand Up @@ -510,7 +510,9 @@ def test_flatten_evidence_hierarchy():
supporting_stmt = top_stmt.supported_by[0]
assert len(supporting_stmt.evidence) == 1
assert supporting_stmt.evidence[0].text == 'foo'
supporting_stmt.evidence[0].text = 'changed_foo'
evs = supporting_stmt.evidence
evs[0].text = 'changed_foo'
supporting_stmt.evidence = evs
assert supporting_stmt.evidence[0].text == 'changed_foo'
assert 'changed_foo' not in [e.text for e in top_stmt.evidence]
assert 'foo' in [e.text for e in top_stmt.evidence]
Expand Down Expand Up @@ -930,8 +932,10 @@ def test_agent_coordinates():
evidence_list = unique_stmt.evidence
agent_annots = [ev.annotations['agents'] for ev in unique_stmt.evidence]
assert all(a['raw_text'] == ['MEK1', 'ERK2'] for a in agent_annots)
assert {tuple(a['coords']) for a in agent_annots} == {((21, 25), (0, 4)),
((0, 4), (15, 19))}
expected_coords = {((21, 25), (0, 4)), ((0, 4), (15, 19))}
for annot in agent_annots:
coords = tuple(tuple(a) for a in annot['coords'])
assert coords in expected_coords


def test_association_duplicate():
Expand Down
12 changes: 6 additions & 6 deletions indra/tests/test_reach.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ def test_get_agent_coordinates_phosphorylation():
stmt = rp.statements[0]
annotations = stmt.evidence[0].annotations

coords = [(0, 3), (42, 45)]
coords = [[0, 3], [42, 45]]
assert annotations['agents']['coords'] == coords


Expand All @@ -379,7 +379,7 @@ def test_get_agent_coordinates_activation():
rp = reach.process_text(test_case, offline=offline)
stmt = rp.statements[0]
annotations = stmt.evidence[0].annotations
coords = [(0, 4), (15, 19)]
coords = [[0, 4], [15, 19]]
assert annotations['agents']['coords'] == coords


Expand All @@ -389,7 +389,7 @@ def test_get_agent_coordinates_regulate_amount():
rp = reach.process_text(test_case, offline=offline)
stmt = rp.statements[0]
annotations = stmt.evidence[0].annotations
coords = [(0, 3), (35, 39)]
coords = [[0, 3], [35, 39]]
assert annotations['agents']['coords'] == coords


Expand All @@ -399,7 +399,7 @@ def test_get_agent_coordinates_binding():
rp = reach.process_text(test_case, offline=offline)
stmt = rp.statements[0]
annotations = stmt.evidence[0].annotations
coords = [(27, 31), (38, 42)]
coords = [[27, 31], [38, 42]]
assert annotations['agents']['coords'] == coords


Expand All @@ -412,7 +412,7 @@ def test_get_agent_coordinates_translocation():
stmt = [stmt for stmt in rp.statements if
isinstance(stmt, Translocation)][0]
annotations = stmt.evidence[0].annotations
coords = [(86, 89)]
coords = [[86, 89]]
assert annotations['agents']['coords'] == coords


Expand All @@ -426,5 +426,5 @@ def test_get_agent_coordinates_phosphorylation_missing_controller():
stmt = [stmt for stmt in rp.statements if
isinstance(stmt, Phosphorylation)][0]
annotations = stmt.evidence[0].annotations
coords = [None, (57, 60)]
coords = [None, [57, 60]]
assert annotations['agents']['coords'] == coords
1 change: 1 addition & 0 deletions indra/tests/test_statements_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ def test_evidence_context():
assert evj['pmid'] == '1'
assert evj['annotations'] == {'a': '2'}
assert ev.to_json() == Evidence._from_json(ev.to_json()).to_json()
assert ev.matches_key() == Evidence._from_json(ev.to_json()).matches_key()


def test_file_serialization():
Expand Down
4 changes: 2 additions & 2 deletions indra/tests/test_trips_ekbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ def test_53():
assert mek.name == 'MEK'
assert erk.name == 'ERK'
for ev in st.evidence:
assert ev.epistemics.get('direct') is False
assert ev.epistemics.get('direct') is False, ev.epistemics


def test_54():
Expand All @@ -753,7 +753,7 @@ def test_54():
assert mek.name == 'EGF'
assert erk.name == 'ERK'
for ev in st.evidence:
assert ev.epistemics.get('direct') is False
assert ev.epistemics.get('direct') is False, ev.epistemics


def test_55():
Expand Down
2 changes: 2 additions & 0 deletions indra/tools/assemble_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,9 +244,11 @@ def merge_deltas(stmts_in):
for info in ('polarity', 'adjectives'):
key = (role, info)
deltas[key] = []
evs = stmt.evidence
for ev in stmt.evidence:
entry = ev.annotations.get('%s_%s' % key)
deltas[key].append(entry if entry else None)
stmt.evidence = evs
# POLARITY
# For polarity we need to work in pairs
polarity_pairs = list(zip(deltas[('subj', 'polarity')],
Expand Down
6 changes: 6 additions & 0 deletions indra/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def timed_func(*args, **kwargs):


def unicode_strs(obj, attr_filter=None):
from indra.statements import Statement
if isinstance(obj, non_unicode):
return False
# Check for an iterable
Expand All @@ -46,6 +47,8 @@ def unicode_strs(obj, attr_filter=None):
return False
if hasattr(obj, '__dict__'):
for item_name, item in obj.__dict__.items():
if isinstance(obj, Statement) and item_name == '_evidence':
continue
if attr_filter and item_name in attr_filter:
continue
has_unicode_strs = unicode_strs(item)
Expand All @@ -61,12 +64,15 @@ def unicode_strs(obj, attr_filter=None):


def decode_obj(obj, encoding='utf-8'):
from indra.statements import Statement
if isinstance(obj, non_unicode):
return obj.decode(encoding)
elif isinstance(obj, list) or isinstance(obj, tuple):
return [decode_obj(item) for item in obj]
elif hasattr(obj, '__dict__'):
for k, v in obj.__dict__.items():
if isinstance(obj, Statement) and k == '_evidence':
continue
obj.__dict__[k] = decode_obj(v)
return obj
elif isinstance(obj, dict):
Expand Down