Add NER tutorial and update documentation configuration

cthoyt · cthoyt · commit ce29d90c720c · 2023-06-30T15:55:23.000+02:00
diff --git a/doc/conf.py b/doc/conf.py
@@ -68,7 +68,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -171,6 +171,6 @@
 
 # Example configuration for intersphinx: refer to the Python standard library.
 intersphinx_mapping = {
-    'https://docs.python.org/': None,
+    'python': ('https://docs.python.org/', None),
     'pyobo': ('https://pyobo.readthedocs.io/en/latest/', None),
 }
diff --git a/doc/modules/index.rst b/doc/modules/index.rst
@@ -36,6 +36,13 @@ Process
     :members:
     :show-inheritance:
 
+Named Entity Recognition
+------------------------
+.. automodule:: gilda.ner
+    :members:
+    :show-inheritance:
+
+
 Pandas Utilities
 ----------------
 .. automodule:: gilda.pandas_utils
diff --git a/doc/requirements.txt b/doc/requirements.txt
@@ -1,4 +1,4 @@
-sphinx
+sphinx<7.0
 sphinx_autodoc_typehints
 sphinx_rtd_theme
 mock
diff --git a/gilda/ner.py b/gilda/ner.py
@@ -1,9 +1,59 @@
+"""
+Gilda implements a simple dictionary-based named entity
+recognition (NER) algorithm. It can be used as follows:
+
+>>> from gilda.ner import annotate
+>>> text = "MEK phosphorylates ERK"
+>>> results = annotate(text)
+
+The results are a list of 4-tuples containing:
+
+- the text string matched
+- a :class:`gilda.ScoredMatch` instance containing the *best* match
+- the position in the text string where the entity starts
+- the position in the text string where the entity ends
+
+In this example, the two concepts are grounded to FamPlex entries.
+
+>>> results[0][0], results[0][1].term.get_curie(), results[0][2], results[0][3]
+('MEK', 'fplx:MEK', 0, 3)
+>>> results[1][0], results[1][1].term.get_curie(), results[1][2], results[1][3]
+('ERK', 'fplx:ERK', 19, 22)
+
+If you look directly at the second element of the 4-tuple, you get a full
+description of the match itself:
+
+>>> results[0][1]
+ScoredMatch(Term(mek,MEK,FPLX,MEK,MEK,curated,famplex,None,None,None),\
+0.9288806431663574,Match(query=mek,ref=MEK,exact=False,space_mismatch=\
+False,dash_mismatches=set(),cap_combos=[('all_lower', 'all_caps')]))
+
+BRAT
+----
+Gilda implements a way to output annotations in a format appropriate for the
+`BRAT Rapid Annotation Tool (BRAT) <https://brat.nlplab.org/index.html>`_.
+
+>>> from gilda.ner import get_brat
+>>> from pathlib import Path
+>>> brat_string = get_brat(results)
+>>> Path("results.ann").write_text(brat_string)
+>>> Path("results.txt").write_text(text)
+
+For BRAT to work, you need to store the text in a file with
+the extension ``.txt`` and the annotations in a file with the
+same name but extension ``.ann``.
+"""
+
 from nltk.corpus import stopwords
 from nltk.tokenize import sent_tokenize
 
 from gilda import ScoredMatch, get_grounder
 from gilda.process import normalize
 
+__all__ = [
+    "annotate",
+    "get_brat",
+]
+
 stop_words = set(stopwords.words('english'))
 
 

Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,7 @@`
`68`	`68`	`#`
`69`	`69`	`# This is also used if you do content translation via gettext catalogs.`
`70`	`70`	`# Usually you set "language" from the command line for these cases.`
`71`		`-language = None`
	`71`	`+language = "en"`
`72`	`72`
`73`	`73`	`# List of patterns, relative to source directory, that match files and`
`74`	`74`	`# directories to ignore when looking for source files.`
`@@ -171,6 +171,6 @@`
`171`	`171`
`172`	`172`	`# Example configuration for intersphinx: refer to the Python standard library.`
`173`	`173`	`intersphinx_mapping = {`
`174`		`- 'https://docs.python.org/': None,`
	`174`	`+ 'python': ('https://docs.python.org/', None),`
`175`	`175`	`'pyobo': ('https://pyobo.readthedocs.io/en/latest/', None),`
`176`	`176`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-sphinx`
	`1`	`+sphinx<7.0`
`2`	`2`	`sphinx_autodoc_typehints`
`3`	`3`	`sphinx_rtd_theme`
`4`	`4`	`mock`