Commit 57d21c3

add left, both direction support, bug fixes, error handling

1 parent 48edda0

File tree

4 files changed: +165 −97 lines

.vscode/settings.json

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/anaconda3/envs/extractacy/bin/python"
+}

README.md

Lines changed: 14 additions & 24 deletions

@@ -36,47 +36,37 @@ nlp.add_pipe(ruler, last=True)
 
 # Define ent_patterns for value extraction
 ent_patterns = {
-    "DISCHARGE_DATE": {"n_tokens": {"n": 1, "direction": "right"}},
-    "TEMP_READING": {"pattern_match": {"patterns": [
-        [{"LIKE_NUM": True}, {"LOWER": {"IN": ["degrees", "farenheit", "celcius", "centigrade"]}}]],
-        "n": 3, "direction": "right"
-        }
-    },
+    "DISCHARGE_DATE": {"patterns": [[{"SHAPE": "dd/dd/dddd"}]], "n": 2, "direction": "right"},
+    "TEMP_READING": {
+        "patterns": [[
+            {"LIKE_NUM": True},
+            {"LOWER": {"IN": ["f", "c", "farenheit", "celcius", "centigrade", "degrees"]}},
+        ]],
+        "n": "sent",
+        "direction": "both",
+    },
 }
 
 valext = ValueExtractor(nlp, ent_patterns)
 nlp.add_pipe(valext, last=True)
 
-doc = nlp("Discharge Date: November 15, 2008. Patient had temp reading of 102.6 degrees.")
+doc = nlp("Discharge Date: 11/15/2008. Patient had temp reading of 102.6 degrees.")
 for e in doc.ents:
     if e._.value_extract:
         print(e.text, e.label_, e._.value_extract)
-## Discharge Date DISCHARGE_DATE November 15, 2008
+## Discharge Date DISCHARGE_DATE 11/15/2008
 ## temp reading TEMP_READING 102.6 degrees
 ```
 
 ### Value Extraction patterns
-There are two options for extracting values: n tokens and first found pattern.
-
-#### N Tokens
-This method will return n tokens past an entity of interest.
+Returns all pattern matches within n tokens of the entity of interest, or within the same sentence. It relies on [spaCy token matching syntax](https://spacy.io/usage/rule-based-matching#matcher).
 
-**Note:**
-* if the immediate next token is whitespace or punctuation, it will be skipped.
-* if the span of n tokens is part of an entity, the entire entity will be returned, even if it is past n tokens
 ```python
-{"ENTITY_NAME": {"n_tokens": {"n": 1, "direction": "right"}}}
-```
-
-#### Pattern Match
-This method will return the first found pattern past an entity of interest within n tokens or within the same sentence. It relies on [spaCy token matching syntax](https://spacy.io/usage/rule-based-matching#matcher).
-
-```python
-{"ENTITY_NAME": {"pattern_match": {"patterns": [{"LOWER": "awesome"}, {"LOWER": "pattern"}], "n": 5, "direction": "right"}}}
+{"ENTITY_NAME": {"patterns": [[{"LOWER": "awesome"}, {"LOWER": "pattern"}]], "n": 5, "direction": "right"}}
 ```
-Use `"n": "sent"` for the within-sentence method rather than n tokens.
+Use `"n": "sent"` for the within-sentence method rather than n tokens.
 
-
 ## Contributing
 [contributing](https://github.com/jenojp/negspacy/blob/master/CONTRIBUTING.md)
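The README example above covers "right" and "both"; the newly added "left" direction follows the same shape. A minimal sketch, mirroring the PAYMENT entry in extractacy/test.py further down (the label and value of n are illustrative), for pulling a value that precedes the entity, as in "12/31/2008: Payment received.":

```python
# Look up to 2 tokens to the LEFT of a PAYMENT entity for a dd/dd/dddd date.
# Mirrors the PAYMENT pattern in extractacy/test.py; label and n are illustrative.
ent_patterns = {
    "PAYMENT": {
        "patterns": [[{"SHAPE": "dd/dd/dddd"}]],
        "n": 2,
        "direction": "left",
    },
}
```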

extractacy/extract.py

Lines changed: 100 additions & 59 deletions

@@ -1,43 +1,41 @@
 from spacy.matcher import Matcher
 from spacy.tokens import Token, Doc, Span
 
-#TODO: handle left direction
+#TODO: error handling for out of bounds token indexes (start and end of doc)
 
 class ValueExtractor(object):
     def __init__(self, nlp, ent_patterns):
 
-        Span.set_extension("value_extract", default=None, force=True)
+        Span.set_extension("value_extract", default=[], force=True)
        self.nlp = nlp
        self.ent_patterns = ent_patterns
 
        self.matcher = Matcher(nlp.vocab)
 
        for key, value in self.ent_patterns.items():
-            if "pattern_match" in value:
-                for pattern in value["pattern_match"]["patterns"]:
-                    self.matcher.add("_" + str(key), None, pattern)
-        print(len(self.matcher))
+            patterns = value["patterns"]
+            self.matcher.add("_" + str(key), patterns)
+
     def __call__(self, doc):
        """Apply the pipeline component on a Doc object and return
        the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
-        print(matches)
        for e in doc.ents:
            if e.end >= len(doc) or e.label_ not in self.ent_patterns.keys():
-                e._.value_extract = None
+                e._.value_extract = []
            else:
-                if "n_tokens" in self.ent_patterns[e.label_].keys():
-                    e._.value_extract = self.get_n_tokens(
-                        doc, e, self.ent_patterns[e.label_]["n_tokens"]["n"],
-                        self.ent_patterns[e.label_]["n_tokens"]["direction"]
-                    )
-                if "pattern_match" in self.ent_patterns[e.label_].keys():
-                    e._.value_extract = self.get_pattern_match(
-                        doc, e, matches, self.ent_patterns[e.label_]["pattern_match"]["n"],
-                        self.ent_patterns[e.label_]["pattern_match"]["direction"]
-                    )
+                e._.value_extract = self.get_pattern_match(
+                    doc, e, matches, self.ent_patterns[e.label_]["n"],
+                    self.ent_patterns[e.label_]["direction"]
+                )
        return doc
 
     def get_pattern_match(self, doc, entity, matches, n, direction):
@@ -46,53 +44,96 @@ def get_pattern_match(self, doc, entity, matches, n, direction):
        in same sentence (if n == "sent")
        """
        if type(n) == int:
-            boundary_idx = entity.end + (n - 1)
+            if direction == "left":
+                boundary_i = entity.start
+                start_i = max(entity.start - n, 0)
+            else:
+                boundary_i = min(entity.end + (n - 1), len(doc))
+                if direction == "right":
+                    start_i = entity.end
+                if direction == "both":
+                    start_i = max(entity.start - n, 0)
+
        elif n == "sent":
-            boundary_idx = entity.sent.end - 1
+            if direction == "right":
+                start_i = entity.end
+                boundary_i = entity.sent.end - 1
+            if direction == "both":
+                start_i = entity.sent.start
+                boundary_i = entity.sent.end - 1
+            if direction == "left":
+                start_i = entity.sent.start
+                boundary_i = entity.start
+
        else:
            raise ValueError("If using pattern_match, expecting n to be an int or equal to 'sent'")
-        first_match = next(
-            (
-                (self.nlp.vocab.strings[match_id], start, end)
+        filtered_matches = [
+            doc[start:end].text
            for match_id, start, end in matches
            if (self.nlp.vocab.strings[match_id] == "_" + entity.label_)
-            and (start >= entity.end)
-            and (start <= boundary_idx)
-            )
-        , None)
-
-        if first_match:
-            return doc[first_match[1]:first_match[2]].text
-        else:
-            return None
+            and (start >= start_i)
+            and (start <= boundary_i)
+        ]
+        return filtered_matches
 
-    def get_n_tokens(self, doc, entity, n, direction):
-        """
-        gets first n tokens to the right or left. If token is
-        part of named entity, the whole span is returned.
-        If first token is punctuation or whitespace, moves to next.
-        """
-        if (doc[entity.end].is_punct == True) or (doc[entity.end].is_space == True):
-            text = self.get_whole_entity(doc, entity.end + 1, n)
-        else:
-            text = self.get_whole_entity(doc, entity.end, n)
+    # def get_n_tokens(self, doc, entity, n, direction):
+    #     """
+    #     gets first n tokens to the right or left. If token is
+    #     part of named entity, the whole span is returned.
+    #     If first token is punctuation or whitespace, moves to next.
+    #     """
+    #     print(entity.text, direction, entity.start, entity.end, len(doc))
+    #     if direction == "right":
+    #         if entity.end + 1 >= len(doc):
+    #             return None
+    #         if (doc[entity.end].is_punct == True) or (doc[entity.end].is_space == True):
+    #             text = self.get_whole_entity(doc, entity.end + 1, n, "right")
+    #         else:
+    #             text = self.get_whole_entity(doc, entity.end, n, "right")
 
-        return text
+    #     if direction == "left":
+    #         if entity.start == 0:
+    #             return None
+    #         if (doc[entity.start - 1].is_punct == True) or (doc[entity.start - 1].is_space == True):
+    #             text = self.get_whole_entity(doc, entity.start - 2, n, "left")
+    #         else:
+    #             text = self.get_whole_entity(doc, entity.start - 1, n, "left")
 
-    def get_whole_entity(self, doc, token_idx, n):
-        """Ensures that if a token is part of a named entity span,
-        the whole span is returned.
-        Span tokens count towards n tokens, however will move past
-        n tokens if a span continues past.
-        """
-        start = token_idx
-        if doc[token_idx].ent_type_ != "":
-            # continue to iterate if token is the beginning or inside ent
-            # OR if haven't reached n tokens yet
-            while (doc[token_idx].ent_iob_ in ["B", "I"]) or (token_idx - start) < n:
-                token_idx += 1
-            text = doc[start:token_idx].text
-        else:
-            text = doc[token_idx : token_idx + n].text
+    #     return text
+
+    # def get_whole_entity(self, doc, token_i, n, direction):
+    #     """Ensures that if a token is part of a named entity span,
+    #     the whole span is returned.
+    #     Span tokens count towards n tokens, however will move past
+    #     n tokens if a span continues past.
+    #     """
+    #     start = token_i
+
+    #     if direction == "right":
+    #         if doc[token_i].ent_type_ != "":
+    #             # continue to iterate if token is the beginning or inside ent
+    #             # OR if haven't reached n tokens yet
+    #             while ((doc[token_i].ent_iob_ in ["B", "I"]) or ((token_i - start) < n)) and (token_i < (len(doc) - 1)):
+    #                 token_i += 1
+    #             text = doc[start:token_i].text
+    #         else:
+    #             text = doc[start : token_i + n].text
+
+    #         return text
+
+    #     if direction == "left":
+    #         if doc[token_i].ent_type_ != "":
+    #             # continue to iterate if token is the beginning or inside ent
+    #             # OR if haven't reached n tokens yet
+    #             while ((doc[token_i].ent_iob_ in ["I"]) or ((start - token_i) < n)) and (token_i > 0):
+    #                 token_i -= 1
+    #             text = doc[token_i:start + 1].text
+    #         else:
+    #             text = doc[token_i - n : start + 1].text
 
-        return text
+    #     return text
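For readers tracing the new window logic in `get_pattern_match`, here is a standalone sketch (not part of the commit) of the `start_i`/`boundary_i` arithmetic for the `type(n) == int` branch; `search_window` is a hypothetical helper name:

```python
# Hypothetical helper reproducing the integer-n window arithmetic from
# get_pattern_match: a match is kept if its start token index falls in
# the inclusive range [start_i, boundary_i].
def search_window(ent_start, ent_end, n, direction, doc_len):
    if direction == "left":
        return max(ent_start - n, 0), ent_start
    # "right" and "both" share the same right-hand boundary
    boundary_i = min(ent_end + (n - 1), doc_len)
    start_i = ent_end if direction == "right" else max(ent_start - n, 0)
    return start_i, boundary_i

# An entity spanning tokens [5, 7) in a 20-token doc, with n=2:
print(search_window(5, 7, 2, "right", 20))  # (7, 8)
print(search_window(5, 7, 2, "left", 20))   # (3, 5)
print(search_window(5, 7, 2, "both", 20))   # (3, 8)
```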

extractacy/test.py

Lines changed: 48 additions & 14 deletions

@@ -8,14 +8,50 @@ def build_docs():
     docs = list()
     docs.append(
         (
-            "Discharge Date: November 15, 2008. Patient had temp reading of 102.6 degrees. Insurance claim sent to patient's account on file: 1112223.",
+            "Discharge Date: 11/15/2008. Patient had temp reading of 102.6 degrees. Insurance claim sent to patient's account on file: 1112223. 12/31/2008: Payment received.",
             [
-                ("Discharge Date", "November 15, 2008"),
-                ("November 15, 2008", None),
-                ("temp", "102.6 degrees"),
-                ("102.6 degrees", None),
-                ("account", "1112223"),
-                ("1112223", None),
+                ("Discharge Date", ["11/15/2008"]),
+                ("11/15/2008", []),
+                ("temp", ["102.6 degrees"]),
+                ("102.6 degrees", []),
+                ("account", ["1112223"]),
+                ("1112223", []),
+                # ("12/31/2008", []),
+                ("Payment received", ["12/31/2008"]),
+            ],
+        )
+    )
+    # testing a case where algorithm attempts to go left of a document start boundary
+    docs.append(
+        (
+            "Payment update: Funds deposited.",
+            [
+                ("Payment update", []),
+            ],
+        )
+    )
+    # testing a case where algorithm attempts to go right of a document end boundary
+    docs.append(
+        (
+            "We do not know the discharge date",
+            [
+                ("discharge date", []),
+            ],
+        )
+    )
+    docs.append(
+        (
+            ":Payment update: Funds deposited.",
+            [
+                ("Payment update", []),
+            ],
+        )
+    )
+    docs.append(
+        (
+            "We do not know the discharge date.",
+            [
+                ("discharge date", []),
             ],
         )
     )
@@ -33,15 +69,17 @@ def test():
             "pattern": [{"LOWER": "discharge"}, {"LOWER": "date"}],
         },
         {"label": "ACCOUNT", "pattern": [{"LOWER": "account"}]},
+        {"label": "PAYMENT", "pattern": [{"LOWER": "payment"}, {"LOWER": "received"}]},
+        {"label": "PAYMENT", "pattern": [{"LOWER": "payment"}, {"LOWER": "update"}]},
 
     ]
     ruler.add_patterns(patterns)
     nlp.add_pipe(ruler, last=True)
 
     ent_patterns = {
-        "DISCHARGE_DATE": {"n_tokens": {"n": 1, "direction": "right"}},
+        "DISCHARGE_DATE": {"patterns": [[{"SHAPE": "dd/dd/dddd"}]], "n": 2, "direction": "right"},
+        "PAYMENT": {"patterns": [[{"SHAPE": "dd/dd/dddd"}]], "n": 2, "direction": "left"},
         "TEMP_READING": {
-            "pattern_match": {
             "patterns": [
                 [
                     {"LIKE_NUM": True},
@@ -54,18 +92,15 @@ def test():
             ],
             "n": 7,
             "direction": "right"
-            }
         },
         "ACCOUNT": {
-            "pattern_match": {
             "patterns": [
                 [
                     {"LIKE_NUM": True, "LENGTH": {"==": 7}},
                 ]
             ],
             "n": "sent",
            "direction": "right"
-            }
        },
    }
 
@@ -77,7 +112,6 @@ def test():
     for i, e in enumerate(doc.ents):
         print(e.text, e._.value_extract)
         assert (e.text, e._.value_extract) == d[1][i]
-
-
+
 if __name__ == "__main__":
     test()
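The updated expectations show that `value_extract` is now list-valued: hits arrive as lists like `["11/15/2008"]` and misses as `[]` rather than `None`, so multiple matches per entity can be preserved. A minimal consumer sketch under that assumption, given a `doc` already processed by the pipeline:

```python
# value_extract is now always a list (possibly empty), so truthiness
# checks still work and each match can be handled individually.
for e in doc.ents:
    for value in e._.value_extract:
        print(e.text, e.label_, value)
```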
