Skip to content

Commit

Permalink
Copy get_sentence_ngrams, get_neighbor_sentence_ngrams, same_sentence…
Browse files Browse the repository at this point in the history
… to data_model_utils.textual

and deprecated them in data_model_utils.tabular (fix HazyResearch#503)
  • Loading branch information
Hiromu Hota committed Sep 9, 2020
1 parent cb24e08 commit 875bb8f
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 30 deletions.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"treedlib>=0.1.3, <0.2.0",
"wand>=0.4.4, <0.6.0",
"ipython",
"deprecation",
],
extras_require={
"spacy_ja": ["fugashi[unidic-lite]>=0.2.3"],
Expand Down
58 changes: 29 additions & 29 deletions src/fonduer/utils/data_model_utils/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,18 @@
from itertools import chain
from typing import DefaultDict, Iterator, List, Optional, Set, Tuple, Union

import deprecation

from fonduer import __version__
from fonduer.candidates.models import Candidate, Mention
from fonduer.candidates.models.span_mention import TemporarySpanMention
from fonduer.parser.models.sentence import Sentence
from fonduer.parser.models.table import Cell, Table
from fonduer.utils.data_model_utils.textual import get_left_ngrams, get_right_ngrams
from fonduer.utils.data_model_utils.textual import (
get_neighbor_sentence_ngrams as get_neighbor_sentence_ngrams_in_textual,
get_sentence_ngrams as get_sentence_ngrams_in_textual,
same_sentence as same_sentence_in_textual,
)
from fonduer.utils.data_model_utils.utils import _to_span, _to_spans
from fonduer.utils.utils import tokens_to_ngrams
from fonduer.utils.utils_table import (
Expand Down Expand Up @@ -79,16 +86,18 @@ def same_cell(c: Candidate) -> bool:
)


@deprecation.deprecated(
    deprecated_in="0.8.3",
    removed_in="0.9.0",
    current_version=__version__,
    details="Use :func:`textual.same_sentence()` instead",
)
def same_sentence(c: Candidate) -> bool:
    """Return True if all Mentions in the given candidate are from the same Sentence.

    Deprecated since 0.8.3 and scheduled for removal in 0.9.0. This wrapper
    only forwards to the implementation in ``data_model_utils.textual``.

    :param c: The candidate whose Mentions are being compared
    :return: The result of ``textual.same_sentence(c)``
    """
    # Single source of truth lives in textual.py; do not duplicate the logic here.
    return same_sentence_in_textual(c)


def get_max_col_num(
Expand Down Expand Up @@ -151,6 +160,12 @@ def get_min_row_num(
return None


@deprecation.deprecated(
    deprecated_in="0.8.3",
    removed_in="0.9.0",
    current_version=__version__,
    details="Use :func:`textual.get_sentence_ngrams()` instead",
)
def get_sentence_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Get the ngrams in the Sentence of the given Mention, not including itself.

    Deprecated since 0.8.3 and scheduled for removal in 0.9.0. This wrapper
    only forwards to the implementation in ``data_model_utils.textual``.

    Note that if a candidate is passed in, all of its Mentions will be
    searched.

    :param mention: The Mention whose Sentence is being searched
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :return: The iterator produced by ``textual.get_sentence_ngrams``
    """
    # The iterator must be returned: a bare call would discard it and make
    # this wrapper hand ``None`` back to callers that iterate over the result.
    return get_sentence_ngrams_in_textual(mention, attrib, n_min, n_max, lower)


@deprecation.deprecated(
    deprecated_in="0.8.3",
    removed_in="0.9.0",
    current_version=__version__,
    details="Use :func:`textual.get_neighbor_sentence_ngrams()` instead",
)
def get_neighbor_sentence_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    d: int = 1,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Get the ngrams that are in the neighboring Sentences of the given Mention.

    Deprecated since 0.8.3 and scheduled for removal in 0.9.0. This wrapper
    only forwards to the implementation in ``data_model_utils.textual``.

    Note that if a candidate is passed in, all of its Mentions will be searched.

    :param mention: The Mention whose neighbor Sentences are being searched
    :param d: Forwarded unchanged to the textual implementation as its ``d``
        argument
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :return: The iterator produced by ``textual.get_neighbor_sentence_ngrams``
    """
    # The iterator must be returned: a bare call would discard it and make
    # this wrapper hand ``None`` back to callers that iterate over the result.
    return get_neighbor_sentence_ngrams_in_textual(
        mention, d, attrib, n_min, n_max, lower
    )


def get_cell_ngrams(
Expand Down
78 changes: 77 additions & 1 deletion src/fonduer/utils/data_model_utils/textual.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
"""Fonduer textual modality utilities."""
from itertools import chain
from typing import Iterator, Union

from fonduer.candidates.models import Candidate, Mention
from fonduer.candidates.models.span_mention import TemporarySpanMention
from fonduer.utils.data_model_utils.utils import _to_span
from fonduer.utils.data_model_utils.utils import _to_span, _to_spans
from fonduer.utils.utils import tokens_to_ngrams


def same_sentence(c: Candidate) -> bool:
    """Return True if all Mentions in the given candidate are from the same Sentence.

    Every Mention must have a Sentence that is not None and that compares
    equal to the Sentence of the first Mention. A candidate without any
    Mentions yields True.

    :param c: The candidate whose Mentions are being compared
    :return: True if all Mentions share one non-None Sentence, else False
    """
    # Explicit length check keeps the "no Mentions -> True" result without
    # touching c[0], which would not exist in that case.
    if len(c) == 0:
        return True

    # Convert the first Mention once instead of on every comparison.
    reference = _to_span(c[0]).sentence
    if reference is None:
        return False

    return all(
        sentence is not None and sentence == reference
        for sentence in (_to_span(c[i]).sentence for i in range(1, len(c)))
    )


def get_between_ngrams(
c: Candidate,
attrib: str = "words",
Expand Down Expand Up @@ -119,3 +132,66 @@ def get_right_ngrams(
lower=lower,
):
yield ngram


def get_sentence_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Yield the ngrams surrounding the given Mention within its Sentence.

    For every span of the Mention, the ngrams to its left are yielded first,
    followed by the ngrams to its right; the span itself is not included.
    Both sides are collected with a window of 100.

    Note that if a candidate is passed in, all of its Mentions will be
    searched.

    :param mention: The Mention whose Sentence is being searched
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    """
    # Options shared by the left-hand and right-hand lookups.
    ngram_options = dict(
        window=100, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
    )
    for current_span in _to_spans(mention):
        yield from get_left_ngrams(current_span, **ngram_options)
        yield from get_right_ngrams(current_span, **ngram_options)


def get_neighbor_sentence_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    d: int = 1,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Yield the ngrams found in the Sentences neighboring the given Mention.

    A Sentence of the same document counts as a neighbor when its position
    differs from the position of the Mention's own Sentence by at most ``d``;
    the Mention's own Sentence is excluded.

    Note that if a candidate is passed in, all of its Mentions will be searched.

    :param mention: The Mention whose neighbor Sentences are being searched
    :param d: The maximum difference in Sentence position for a neighbor
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    """
    for current_span in _to_spans(mention):
        home = current_span.sentence
        for candidate_sentence in home.document.sentences:
            if abs(candidate_sentence.position - home.position) <= d and (
                candidate_sentence != home
            ):
                yield from tokens_to_ngrams(
                    getattr(candidate_sentence, attrib),
                    n_min=n_min,
                    n_max=n_max,
                    lower=lower,
                )

0 comments on commit 875bb8f

Please sign in to comment.