Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Let get function take range annotation #769

Merged
merged 17 commits into from
May 25, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Merge remote-tracking branch 'origin/master' into issue694
wanglechuan-gif committed May 18, 2022
commit 874c3d94502d6b3b8eac531a71de1c56e37100bc
69 changes: 35 additions & 34 deletions forte/data/data_store.py
Original file line number Diff line number Diff line change
@@ -88,12 +88,6 @@ def __init__(
which are compulsory for annotations entries, represent the begin and
end character indices of entries in the payload.

The last field is always ``index_id`` for entries that are not
Annotation-like. It is an extra field to record the location of the
entry in the list. When the user adds a new entry to the data store,
the ``index_id`` will be created and appended to the end of the original
``entry data`` list.

Here, ``type_name`` is the fully qualified name of this type represented
by ``entry list``. It must be a valid ontology defined as a class.
``tid`` is a unique id of every entry, which is internally generated by
@@ -138,10 +132,11 @@ def __init__(
"""
The ``_type_attributes`` is a private dictionary that provides
``type_name``, their parent entry, and the order of corresponding attributes.
The keys are fully qualified names of every type; The value is a dictionary with
two keys. Key ``attribute`` provides an inner dictionary with all valid attributes
for this type and the indices of attributes among these lists. Key ``parent_class``
is a string representing the ancestors of this type.
The keys are fully qualified names of every type; The value is a
dictionary with two keys. Key ``attribute`` provides an inner dictionary
with all valid attributes for this type and the indices of attributes
among these lists. Key ``parent_class`` is a string representing the
ancestors of this type.

This structure is supposed to be built dynamically. When a user adds
new entries, data_store will check unknown types and add them to
@@ -234,10 +229,11 @@ def _new_tid(self) -> int:

def _get_type_info(self, type_name: str) -> Dict[str, Any]:
"""
Get the dictionary containing type information from ``DataStore._type_attributes``.
If the ``type_name`` does not currently exist and dynamic import is enabled,
this function will add a new key-value pair into ``DataStore._type_attributes``. The
value consists of a full attribute-to-index dictionary and an empty parent set.
Get the dictionary containing type information from
``DataStore._type_attributes``. If the ``type_name`` does not currently
exist and dynamic import is enabled, this function will add a new
key-value pair into ``DataStore._type_attributes``. The value consists
of a full attribute-to-index dictionary and an empty parent set.

This function returns a dictionary containing an attribute dict and a
set of parent entries of the given type. For example:
@@ -405,8 +401,8 @@ def _is_subclass(
) -> bool:
r"""This function takes a fully qualified ``type_name`` class name,
``cls`` class and returns whether ``type_name`` class is the ``cls``
subclass or not. This function accepts two types of class: the class defined
in forte, or the classes in user provided ontology file.
subclass or not. This function accepts two types of class: the class
defined in forte, or the classes in user provided ontology file.

Args:
type_name: A fully qualified name of an entry class.
@@ -694,7 +690,11 @@ def _delete_entry_by_loc(self, type_name: str, index_id: int):
self.__elements.pop(type_name)
else:
target_list[index_id] = None
if len(target_list) - target_list.count(None) == 0:
if type_name in self.__deletion_count:
self.__deletion_count[type_name] += 1
else:
self.__deletion_count[type_name] = 1
if len(target_list) - self.__deletion_count[type_name] == 0:
self.__elements.pop(type_name)

def get_entry(self, tid: int) -> Tuple[List, str]:
@@ -955,24 +955,25 @@ def within_range(entry: List[Any], range_annotation: List[int]) -> bool:
yield entry
elif issubclass(entry_class, Group):
for type in all_types:
if range_annotation is None:
yield from self.__elements[type]
else:
for entry in self.__elements[type]:
member_list = entry[constants.END_INDEX]
all_within = True
for tid in member_list:
member = self.__entry_dict[tid]
if not within_range(member, range_annotation):
all_within = False
break
if all_within:
yield entry
yield from self.iter(type)
else:
raise ValueError(
f"Currently, {type_name} is not supported. "
"You may only get entries of types among Annotation, Link, Group."
)
if type_name not in self.__elements:
raise KeyError(f"type {type_name} does not exist")
yield from self.iter(type_name)

def iter(self, type_name: str) -> Iterator[List]:
r"""This function iterates all `type_name` entries. It skips None
placeholders that appear in non-annotation-like entry lists.

Args:
type_name (str): The fully qualified type name of a type.

Returns:
An iterator of the entries.
"""
for e in self.__elements[type_name]:
if e is not None:
yield e

def next_entry(self, tid: int) -> Optional[List]:
r"""Get the next entry of the same type as the ``tid`` entry.
219 changes: 45 additions & 174 deletions tests/forte/data/data_store_test.py
Original file line number Diff line number Diff line change
@@ -164,7 +164,7 @@ def setUp(self) -> None:
1234,
"ft.onto.base_ontology.Document",
None,
"Postive",
"Positive",
None,
]
ref2 = [
@@ -183,7 +183,7 @@ def setUp(self) -> None:
"ft.onto.base_ontology.Sentence",
"teacher",
1,
"Postive",
"Positive",
None,
None,
]
@@ -206,81 +206,29 @@ def setUp(self) -> None:
]

self.data_store._DataStore__elements = {
"ft.onto.base_ontology.Document": SortedList(
[
[
0,
5,
1234,
"ft.onto.base_ontology.Document",
None,
"Positive",
None,
],
[
10,
25,
3456,
"ft.onto.base_ontology.Document",
"Doc class A",
"Negative",
"Class B",
],
]
),
"ft.onto.base_ontology.Sentence": SortedList(
[
[
6,
9,
9999,
"ft.onto.base_ontology.Sentence",
"teacher",
1,
"Positive",
None,
None,
],
[
55,
70,
1234567,
"ft.onto.base_ontology.Sentence",
None,
None,
"Negative",
"Class C",
"Class D",
],
]
),
"ft.onto.base_ontology.Document": SortedList([ref1, ref2]),
"ft.onto.base_ontology.Sentence": SortedList([ref3, ref4]),
# empty list corresponds to Entry, test only
"forte.data.ontology.core.Entry": SortedList([]),
# one plain Annotation entry (tid 7654), test only
"forte.data.ontology.top.Annotation": SortedList(
[
[
10,
20,
7654,
"forte.data.ontology.top.Annotation",
]
]
),
"forte.data.ontology.top.Annotation": SortedList([ref5]),
"forte.data.ontology.top.Group": [
[
"ft.onto.base_ontology.Sentence",
[9999, 1234567],
10123,
"forte.data.ontology.top.Group",
0,
],
[
"ft.onto.base_ontology.Document",
[1234, 3456],
23456,
"forte.data.ontology.top.Group",
1,
],
[
"forte.data.ontology.top.Annotation",
[1234, 7654],
34567,
"forte.data.ontology.top.Group",
],
],
"forte.data.ontology.top.Link": [
@@ -289,78 +237,20 @@ def setUp(self) -> None:
1234,
88888,
"forte.data.ontology.top.Link",
0,
],
],
}
self.data_store._DataStore__entry_dict = {
1234: [
0,
5,
1234,
"ft.onto.base_ontology.Document",
None,
"Positive",
None,
],
3456: [
10,
25,
3456,
"ft.onto.base_ontology.Document",
"Doc class A",
"Negative",
"Class B",
],
7654: [
10,
20,
7654,
"forte.data.ontology.top.Annotation",
],
9999: [
6,
9,
9999,
"ft.onto.base_ontology.Sentence",
"teacher",
1,
"Positive",
None,
None,
],
1234567: [
55,
70,
1234567,
"ft.onto.base_ontology.Sentence",
None,
None,
"Negative",
"Class C",
"Class D",
],
10123: [
"ft.onto.base_ontology.Sentence",
[9999, 1234567],
10123,
"forte.data.ontology.top.Group",
0,
],
23456: [
"ft.onto.base_ontology.Document",
[1234, 3456],
23456,
"forte.data.ontology.top.Group",
1,
],
88888: [
9999,
1234,
88888,
"forte.data.ontology.top.Link",
0,
],
self.data_store._DataStore__tid_ref_dict = {
1234: ref1,
3456: ref2,
9999: ref3,
1234567: ref4,
7654: ref5}
self.data_store._DataStore__tid_idx_dict = {
10123: ["forte.data.ontology.top.Group", 0],
23456: ["forte.data.ontology.top.Group", 1],
34567: ["forte.data.ontology.top.Group", 2],
88888: ["forte.data.ontology.top.Link", 0],
}

def test_get_type_info(self):
@@ -611,7 +501,7 @@ def test_add_annotation_raw(self):

self.assertEqual(num_doc, 3)
self.assertEqual(num_sent, 3)
self.assertEqual(len(self.data_store._DataStore__entry_dict), 10)
self.assertEqual(len(self.data_store._DataStore__tid_ref_dict), 7)

# test add new annotation type
self.data_store.add_annotation_raw(
@@ -624,7 +514,7 @@ def test_add_annotation_raw(self):
)
self.assertEqual(num_phrase, 1)
self.assertEqual(len(DataStore._type_attributes), 3)
self.assertEqual(len(self.data_store._DataStore__entry_dict), 11)
self.assertEqual(len(self.data_store._DataStore__tid_ref_dict), 8)

def test_get_attribute(self):
speaker = self.data_store.get_attribute(9999, "speaker")
@@ -705,41 +595,9 @@ def test_get(self):
self.assertEqual(instances[0][2], 1234)
self.assertEqual(instances[1][2], 3456)

# get document entries with range
instances = list(
self.data_store.get(
"ft.onto.base_ontology.Document", range_annotation=[0, 6]
)
)
self.assertEqual(len(instances), 1)
instances = list(
self.data_store.get(
"ft.onto.base_ontology.Document", range_annotation=[1, 6]
)
)
self.assertEqual(len(instances), 0)

# get "forte.data.ontology.core.Entry" will raise an error
with self.assertRaisesRegex(
ValueError,
"Currently, forte.data.ontology.core.Entry is not supported. "
"You may only get entries of types among Annotation, Link, Group.",
):
list(self.data_store.get("forte.data.ontology.core.Entry"))

# get annotations without subclasses
instances = list(
self.data_store.get(
"forte.data.ontology.top.Annotation", include_sub_type=False
)
)
self.assertEqual(len(instances), 1)

# get annotations with subclasses
instances = list(
self.data_store.get("forte.data.ontology.top.Annotation")
)
self.assertEqual(len(instances), 5)
# get all entries
instances = list(self.data_store.get("forte.data.ontology.core.Entry"))
self.assertEqual(len(instances), 9)

# get annotations with subclasses and range annotation
instances = list(
@@ -783,7 +641,8 @@ def test_get(self):

def test_delete_entry(self):
# delete annotation
# has a total of 8 entries
# has a total of 5 entries
self.assertEqual(len(self.data_store._DataStore__tid_ref_dict), 5)
self.data_store.delete_entry(1234567)
self.data_store.delete_entry(1234)
self.data_store.delete_entry(9999)
@@ -804,20 +663,32 @@ def test_delete_entry(self):
None
)

self.assertEqual(len(self.data_store._DataStore__entry_dict), 5)
self.assertEqual(len(self.data_store._DataStore__tid_ref_dict), 2)
self.assertEqual(num_doc, 1)
self.assertEqual(num_group, 2)

# delete group
self.data_store.delete_entry(10123)
self.assertEqual(len(self.data_store._DataStore__entry_dict), 4)
self.assertEqual(len(self.data_store._DataStore__tid_idx_dict), 3)
self.data_store.delete_entry(23456)
self.assertEqual(len(self.data_store._DataStore__entry_dict), 3)
self.data_store.delete_entry(34567)
self.assertTrue(
"forte.data.ontology.top.Group"
not in self.data_store._DataStore__elements
)

# delete link
self.assertTrue(
"forte.data.ontology.top.Link"
in self.data_store._DataStore__elements
)

self.data_store.delete_entry(88888)
self.assertTrue(
"forte.data.ontology.top.Link"
not in self.data_store._DataStore__elements
)

def test_delete_entry_nonexist(self):
# Entry tid does not exist; should raise a KeyError
with self.assertRaises(KeyError):
@@ -828,7 +699,7 @@ def test_delete_entry_by_loc(self):
"ft.onto.base_ontology.Document", 1
)
# dict entry is not deleted; only delete entry in element list
self.assertEqual(len(self.data_store._DataStore__entry_dict), 8)
self.assertEqual(len(self.data_store._DataStore__tid_ref_dict), 5)
self.assertEqual(
len(
self.data_store._DataStore__elements[
You are viewing a condensed version of this merge commit. You can view the full changes here.