Commit 57d21c3

add left, both direction support, bug fixes, error handling

1 parent 48edda0

File tree

4 files changed: +165 −97 lines

.vscode/settings.json

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/anaconda3/envs/extractacy/bin/python"
+}

README.md

Lines changed: 14 additions & 24 deletions

@@ -36,47 +36,37 @@ nlp.add_pipe(ruler, last=True)
 
 # Define ent_patterns for value extraction
 ent_patterns = {
-    "DISCHARGE_DATE": {"n_tokens": {"n": 1, "direction": "right"}},
-    "TEMP_READING": {"pattern_match": {"patterns": [
-        [{"LIKE_NUM": True}, {"LOWER": {"IN": ["degrees", "farenheit", "celcius", "centigrade"]}}]],
-        "n": 3, "direction": "right"
-        }
-    },
+    "DISCHARGE_DATE": {"patterns": [[{"SHAPE": "dd/dd/dddd"}]], "n": 2, "direction": "right"},
+    "TEMP_READING": {
+        "patterns": [[
+            {"LIKE_NUM": True},
+            {"LOWER": {"IN": ["f", "c", "farenheit", "celcius", "centigrade", "degrees"]}},
+        ]],
+        "n": "sent",
+        "direction": "both",
+    },
 }
 
 valext = ValueExtractor(nlp, ent_patterns)
 nlp.add_pipe(valext, last=True)
 
-doc = nlp("Discharge Date: November 15, 2008. Patient had temp reading of 102.6 degrees.")
+doc = nlp("Discharge Date: 11/15/2008. Patient had temp reading of 102.6 degrees.")
 for e in doc.ents:
     if e._.value_extract:
         print(e.text, e.label_, e._.value_extract)
-## Discharge Date DISCHARGE_DATE November 15, 2008
+## Discharge Date DISCHARGE_DATE 11/15/2008
 ## temp reading TEMP_READING 102.6 degrees
 ```
 
 ### Value Extraction patterns
-There are two options for extracting values: n tokens and first found pattern.
-
-#### N Tokens
-This method will return n tokens past an entity of interest.
+Returns all pattern matches within n tokens of the entity of interest, or within the same sentence. It relies on [spaCy token matching syntax](https://spacy.io/usage/rule-based-matching#matcher).
 
-**Note:**
-* if the immediate next token is whitespace or punctuation, it will be skipped.
-* if the span of n tokens is part of an entity, the entire entity will be returned, even if it is past n tokens
 ```python
-{"ENTITY_NAME": {"n_tokens": {"n": 1, "direction": "right"}}}
-```
-
-#### Pattern Match
-This method will return the first found pattern past an entity of interest within n tokens or within the same sentence. It relies on [spaCy token matching syntax](https://spacy.io/usage/rule-based-matching#matcher).
-
-```python
-{"ENTITY_NAME": {"pattern_match": {"patterns": [{"LOWER": "awesome"}, {"LOWER": "pattern"}], "n": 5, "direction": "right"}}}
+{"ENTITY_NAME": {"patterns": [[{"LOWER": "awesome"}, {"LOWER": "pattern"}]], "n": 5, "direction": "right"}}
 ```
-Use `"n": "sent"` for the within-sentence method rather than n tokens.
+Use `"n": "sent"` for the within-sentence method rather than n tokens.
 
-
 ## Contributing
 [contributing](https://github.com/jenojp/negspacy/blob/master/CONTRIBUTING.md)
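The README example above covers "right" and "both"; the newly added "left" direction follows the same shape. A minimal sketch, mirroring the PAYMENT entry in extractacy/test.py further down (the label and value of n are illustrative), for pulling a value that precedes the entity, as in "12/31/2008: Payment received.":

```python
# Look up to 2 tokens to the LEFT of a PAYMENT entity for a dd/dd/dddd date.
# Mirrors the PAYMENT pattern in extractacy/test.py; label and n are illustrative.
ent_patterns = {
    "PAYMENT": {
        "patterns": [[{"SHAPE": "dd/dd/dddd"}]],
        "n": 2,
        "direction": "left",
    },
}
```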

extractacy/extract.py

Lines changed: 100 additions & 59 deletions

@@ -1,43 +1,41 @@
 from spacy.matcher import Matcher
 from spacy.tokens import Token, Doc, Span
 
-#TODO: handle left direction
+#TODO: error handling for out of bounds token indexes (start and end of doc)
 
 class ValueExtractor(object):
     def __init__(self, nlp, ent_patterns):
 
-        Span.set_extension("value_extract", default=None, force=True)
+        Span.set_extension("value_extract", default=[], force=True)
        self.nlp = nlp
        self.ent_patterns = ent_patterns
 
        self.matcher = Matcher(nlp.vocab)
 
        for key, value in self.ent_patterns.items():
-            if "pattern_match" in value:
-                for pattern in value["pattern_match"]["patterns"]:
-                    self.matcher.add("_" + str(key), None, pattern)
-        print(len(self.matcher))
+            patterns = value["patterns"]
+            self.matcher.add("_" + str(key), patterns)
+
     def __call__(self, doc):
        """Apply the pipeline component on a Doc object and return
        the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
-        print(matches)
        for e in doc.ents:
            if e.end >= len(doc) or e.label_ not in self.ent_patterns.keys():
-                e._.value_extract = None
+                e._.value_extract = []
            else:
-                if "n_tokens" in self.ent_patterns[e.label_].keys():
-                    e._.value_extract = self.get_n_tokens(
-                        doc, e, self.ent_patterns[e.label_]["n_tokens"]["n"],
-                        self.ent_patterns[e.label_]["n_tokens"]["direction"]
-                    )
-                if "pattern_match" in self.ent_patterns[e.label_].keys():
-                    e._.value_extract = self.get_pattern_match(
-                        doc, e, matches, self.ent_patterns[e.label_]["pattern_match"]["n"],
-                        self.ent_patterns[e.label_]["pattern_match"]["direction"]
-                    )
+                e._.value_extract = self.get_pattern_match(
+                    doc, e, matches, self.ent_patterns[e.label_]["n"],
+                    self.ent_patterns[e.label_]["direction"]
+                )
        return doc
 
     def get_pattern_match(self, doc, entity, matches, n, direction):
@@ -46,53 +44,96 @@ def get_pattern_match(self, doc, entity, matches, n, direction):
        in same sentence (if n == "sent")
        """
        if type(n) == int:
-            boundary_idx = entity.end + (n - 1)
+            if direction == "left":
+                boundary_i = entity.start
+                start_i = max(entity.start - n, 0)
+            else:
+                boundary_i = min(entity.end + (n - 1), len(doc))
+                if direction == "right":
+                    start_i = entity.end
+                if direction == "both":
+                    start_i = max(entity.start - n, 0)
+
        elif n == "sent":
-            boundary_idx = entity.sent.end - 1
+            if direction == "right":
+                start_i = entity.end
+                boundary_i = entity.sent.end - 1
+            if direction == "both":
+                start_i = entity.sent.start
+                boundary_i = entity.sent.end - 1
+            if direction == "left":
+                start_i = entity.sent.start
+                boundary_i = entity.start
+
        else:
            raise ValueError("If using pattern_match, expecting n to be an int or equal to 'sent'")
-        first_match = next(
-            (
-                (self.nlp.vocab.strings[match_id], start, end)
+        filtered_matches = [
+            doc[start:end].text
            for match_id, start, end in matches
            if (self.nlp.vocab.strings[match_id] == "_" + entity.label_)
-            and (start >= entity.end)
-            and (start <= boundary_idx)
-            )
-        , None)
-
-        if first_match:
-            return doc[first_match[1]:first_match[2]].text
-        else:
-            return None
+            and (start >= start_i)
+            and (start <= boundary_i)
+        ]
+        return filtered_matches
 
-    def get_n_tokens(self, doc, entity, n, direction):
-        """
-        gets first n tokens to the right or left. If token is
-        part of named entity, the whole span is returned.
-        If first token is punctuation or whitespace, moves to next.
-        """
-        if (doc[entity.end].is_punct == True) or (doc[entity.end].is_space == True):
-            text = self.get_whole_entity(doc, entity.end + 1, n)
-        else:
-            text = self.get_whole_entity(doc, entity.end, n)
+    # def get_n_tokens(self, doc, entity, n, direction):
+    #     """
+    #     gets first n tokens to the right or left. If token is
+    #     part of named entity, the whole span is returned.
+    #     If first token is punctuation or whitespace, moves to next.
+    #     """
+    #     print(entity.text, direction, entity.start, entity.end, len(doc))
+    #     if direction == "right":
+    #         if entity.end + 1 >= len(doc):
+    #             return None
+    #         if (doc[entity.end].is_punct == True) or (doc[entity.end].is_space == True):
+    #             text = self.get_whole_entity(doc, entity.end + 1, n, "right")
+    #         else:
+    #             text = self.get_whole_entity(doc, entity.end, n, "right")
 
-        return text
+    #     if direction == "left":
+    #         if entity.start == 0:
+    #             return None
+    #         if (doc[entity.start - 1].is_punct == True) or (doc[entity.start - 1].is_space == True):
+    #             text = self.get_whole_entity(doc, entity.start - 2, n, "left")
+    #         else:
+    #             text = self.get_whole_entity(doc, entity.start - 1, n, "left")
 
-    def get_whole_entity(self, doc, token_idx, n):
-        """Ensures that if a token is part of a named entity span,
-        the whole span is returned.
-        Span tokens count towards n tokens, however will move past
-        n tokens if a span continues past.
-        """
-        start = token_idx
-        if doc[token_idx].ent_type_ != "":
-            # continue to iterate if token is the beginning or inside ent
-            # OR if haven't reached n tokens yet
-            while (doc[token_idx].ent_iob_ in ["B", "I"]) or (token_idx - start) < n:
-                token_idx += 1
-            text = doc[start:token_idx].text
-        else:
-            text = doc[token_idx : token_idx + n].text
+    #     return text
+
+    # def get_whole_entity(self, doc, token_i, n, direction):
+    #     """Ensures that if a token is part of a named entity span,
+    #     the whole span is returned.
+    #     Span tokens count towards n tokens, however will move past
+    #     n tokens if a span continues past.
+    #     """
+    #     start = token_i
+
+    #     if direction == "right":
+    #         if doc[token_i].ent_type_ != "":
+    #             # continue to iterate if token is the beginning or inside ent
+    #             # OR if haven't reached n tokens yet
+    #             while ((doc[token_i].ent_iob_ in ["B", "I"]) or ((token_i - start) < n)) and (token_i < (len(doc) - 1)):
+    #                 token_i += 1
+    #             text = doc[start:token_i].text
+    #         else:
+    #             text = doc[start : token_i + n].text
+
+    #         return text
+
+    #     if direction == "left":
+    #         if doc[token_i].ent_type_ != "":
+    #             # continue to iterate if token is the beginning or inside ent
+    #             # OR if haven't reached n tokens yet
+    #             while ((doc[token_i].ent_iob_ in ["I"]) or ((start - token_i) < n)) and (token_i > 0):
+    #                 token_i -= 1
+    #             text = doc[token_i:start + 1].text
+    #         else:
+    #             text = doc[token_i - n : start + 1].text
 
-        return text
+    #     return text
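For readers tracing the new window logic in `get_pattern_match`, here is a standalone sketch (not part of the commit) of the `start_i`/`boundary_i` arithmetic for the `type(n) == int` branch; `search_window` is a hypothetical helper name:

```python
# Hypothetical helper reproducing the integer-n window arithmetic from
# get_pattern_match: a match is kept if its start token index falls in
# the inclusive range [start_i, boundary_i].
def search_window(ent_start, ent_end, n, direction, doc_len):
    if direction == "left":
        return max(ent_start - n, 0), ent_start
    # "right" and "both" share the same right-hand boundary
    boundary_i = min(ent_end + (n - 1), doc_len)
    start_i = ent_end if direction == "right" else max(ent_start - n, 0)
    return start_i, boundary_i

# An entity spanning tokens [5, 7) in a 20-token doc, with n=2:
print(search_window(5, 7, 2, "right", 20))  # (7, 8)
print(search_window(5, 7, 2, "left", 20))   # (3, 5)
print(search_window(5, 7, 2, "both", 20))   # (3, 8)
```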

extractacy/test.py

Lines changed: 48 additions & 14 deletions

@@ -8,14 +8,50 @@ def build_docs():
     docs = list()
     docs.append(
         (
-            "Discharge Date: November 15, 2008. Patient had temp reading of 102.6 degrees. Insurance claim sent to patient's account on file: 1112223.",
+            "Discharge Date: 11/15/2008. Patient had temp reading of 102.6 degrees. Insurance claim sent to patient's account on file: 1112223. 12/31/2008: Payment received.",
             [
-                ("Discharge Date", "November 15, 2008"),
-                ("November 15, 2008", None),
-                ("temp", "102.6 degrees"),
-                ("102.6 degrees", None),
-                ("account", "1112223"),
-                ("1112223", None),
+                ("Discharge Date", ["11/15/2008"]),
+                ("11/15/2008", []),
+                ("temp", ["102.6 degrees"]),
+                ("102.6 degrees", []),
+                ("account", ["1112223"]),
+                ("1112223", []),
+                # ("12/31/2008", []),
+                ("Payment received", ["12/31/2008"]),
+            ],
+        )
+    )
+    # testing a case where algorithm attempts to go left of a document start boundary
+    docs.append(
+        (
+            "Payment update: Funds deposited.",
+            [
+                ("Payment update", []),
+            ],
+        )
+    )
+    # testing a case where algorithm attempts to go right of a document end boundary
+    docs.append(
+        (
+            "We do not know the discharge date",
+            [
+                ("discharge date", []),
+            ],
+        )
+    )
+    docs.append(
+        (
+            ":Payment update: Funds deposited.",
+            [
+                ("Payment update", []),
+            ],
+        )
+    )
+    docs.append(
+        (
+            "We do not know the discharge date.",
+            [
+                ("discharge date", []),
             ],
         )
     )
@@ -33,15 +69,17 @@ def test():
             "pattern": [{"LOWER": "discharge"}, {"LOWER": "date"}],
         },
         {"label": "ACCOUNT", "pattern": [{"LOWER": "account"}]},
+        {"label": "PAYMENT", "pattern": [{"LOWER": "payment"}, {"LOWER": "received"}]},
+        {"label": "PAYMENT", "pattern": [{"LOWER": "payment"}, {"LOWER": "update"}]},
 
     ]
     ruler.add_patterns(patterns)
     nlp.add_pipe(ruler, last=True)
 
     ent_patterns = {
-        "DISCHARGE_DATE": {"n_tokens": {"n": 1, "direction": "right"}},
+        "DISCHARGE_DATE": {"patterns": [[{"SHAPE": "dd/dd/dddd"}]], "n": 2, "direction": "right"},
+        "PAYMENT": {"patterns": [[{"SHAPE": "dd/dd/dddd"}]], "n": 2, "direction": "left"},
         "TEMP_READING": {
-            "pattern_match": {
             "patterns": [
                 [
                     {"LIKE_NUM": True},
@@ -54,18 +92,15 @@ def test():
             ],
             "n": 7,
             "direction": "right"
-            }
         },
         "ACCOUNT": {
-            "pattern_match": {
             "patterns": [
                 [
                     {"LIKE_NUM": True, "LENGTH": {"==": 7}},
                 ]
             ],
             "n": "sent",
            "direction": "right"
-            }
        },
    }
 
@@ -77,7 +112,6 @@ def test():
     for i, e in enumerate(doc.ents):
         print(e.text, e._.value_extract)
         assert (e.text, e._.value_extract) == d[1][i]
-
-
+
 if __name__ == "__main__":
     test()
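The updated expectations show that `value_extract` is now list-valued: hits arrive as lists like `["11/15/2008"]` and misses as `[]` rather than `None`, so multiple matches per entity can be preserved. A minimal consumer sketch under that assumption, given a `doc` already processed by the pipeline:

```python
# value_extract is now always a list (possibly empty), so truthiness
# checks still work and each match can be handled individually.
for e in doc.ents:
    for value in e._.value_extract:
        print(e.text, e.label_, value)
```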
