|
| 1 | +""" |
| 2 | +Gilda implements a simple dictionary-based named entity |
| 3 | +recognition (NER) algorithm. It can be used as follows: |
| 4 | +
|
| 5 | +>>> from gilda.ner import annotate |
| 6 | +>>> text = "MEK phosphorylates ERK" |
| 7 | +>>> results = annotate(text) |
| 8 | +
|
| 9 | +The result is a list of 4-tuples, each containing: |
| 10 | +- the text string matched |
| 11 | +- a :class:`gilda.ScoredMatch` instance containing the *best* match |
| 12 | +- the position in the text string where the entity starts |
| 13 | +- the position in the text string where the entity ends |
| 14 | +
|
| 15 | +In this example, the two concepts are grounded to FamPlex entries. |
| 16 | +
|
| 17 | +>>> results[0][0], results[0][1].term.get_curie(), results[0][2], results[0][3] |
| 18 | +('MEK', 'fplx:MEK', 0, 3) |
| 19 | +>>> results[1][0], results[1][1].term.get_curie(), results[1][2], results[1][3] |
| 20 | +('ERK', 'fplx:ERK', 19, 22) |
| 21 | +
|
| 22 | +If you look directly at the second element of the 4-tuple, you get a full |
| 23 | +description of the match itself: |
| 24 | +
|
| 25 | +>>> results[0][1] |
| 26 | +ScoredMatch(Term(mek,MEK,FPLX,MEK,MEK,curated,famplex,None,None,None),\ |
| 27 | +0.9288806431663574,Match(query=mek,ref=MEK,exact=False,space_mismatch=\ |
| 28 | +False,dash_mismatches=set(),cap_combos=[('all_lower', 'all_caps')])) |
| 29 | +
|
| 30 | +BRAT |
| 31 | +---- |
| 32 | +Gilda implements a way to output annotations in a format appropriate for the |
| 33 | +`BRAT Rapid Annotation Tool (BRAT) <https://brat.nlplab.org/index.html>`_. |
| 34 | +
|
| 35 | +>>> from gilda.ner import get_brat |
| 36 | +>>> from pathlib import Path |
| 37 | +>>> brat_string = get_brat(results) |
| 38 | +>>> Path("results.ann").write_text(brat_string) |
| 39 | +>>> Path("results.txt").write_text(text) |
| 40 | +
|
| 41 | +For BRAT to work, you need to store the text in a file with |
| 42 | +the extension ``.txt`` and the annotations in a file with the |
| 43 | +same name but the extension ``.ann``. |
| 44 | +""" |
| 45 | + |
1 | 46 | from nltk.corpus import stopwords
|
2 | 47 | from nltk.tokenize import sent_tokenize
|
3 | 48 |
|
4 | 49 | from gilda import ScoredMatch, get_grounder
|
5 | 50 | from gilda.process import normalize
|
6 | 51 |
|
| 52 | +__all__ = [ |
| 53 | + "annotate", |
| 54 | + "get_brat", |
| 55 | +] |
| 56 | + |
7 | 57 | stop_words = set(stopwords.words('english'))
|
8 | 58 |
|
9 | 59 |
|
|
0 commit comments