Skip to content

Symbol extraction improvements: merging with scripts and font formats #222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 103 additions & 20 deletions data-processing/common/parse_equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def repair_operator_tags(element: Tag) -> None:
if element.name != "mi":
return

if element.text in ["∀", "∃", "|", "∥", "."]:
if element.text in ["∀", "∃", "|", "∥", "∣", ".", "/", "%"]:
operator = clone_element(element)
operator.name = "mo"
element.replace_with(operator)
Expand Down Expand Up @@ -196,8 +196,16 @@ def merge_row_elements(element: Tag) -> None:
merged = merger.merge(elements)

# If the 'mrow' only contains one element after its children are merged, simplify the
# MathML tree replacing this node with its merged child.
# MathML tree replacing this node with its merged child. Preserve the start and end
# position of the row element if it is specified, because this often means that a styling
# macro was applied to the children, and the start and end positions of the row include
# the control sequence and braces for the styling macro.
if len(merged) == 1:
start = element.attrs.get("s2:start")
end = element.attrs.get("s2:end")
if start and end:
merged[0].attrs["s2:start"] = start
merged[0].attrs["s2:end"] = end
element.replace_with(merged[0])
else:
for e in elements:
Expand Down Expand Up @@ -656,6 +664,9 @@ def create_element(tag_name: str) -> Tag:
return BeautifulSoup("", "lxml").new_tag(tag_name)


# MathML elements that attach a subscript and/or a superscript to a base element. The names must
# match MathML tag names exactly ("msup", not "musp"), because they are compared against
# 'element.name' when deciding how to merge elements.
SCRIPT_TAGS = ["msub", "msup", "msubsup"]


class MathMlElementMerger:
def merge(self, elements: List[Tag]) -> List[Tag]:
"""
Expand Down Expand Up @@ -691,10 +702,15 @@ def merge(self, elements: List[Tag]) -> List[Tag]:

def _is_mergeable_type(self, element: Tag) -> bool:
" Determine if an element is a type that is mergeable with other elements. "
MERGEABLE_TOKEN_TAGS = ["mn", "mi"]
return element.name in MERGEABLE_TOKEN_TAGS and _has_s2_offset_annotations(
element
)

if not _has_s2_offset_annotations(element):
return False

MERGEABLE_TOKEN_TAGS = ["mn", "mi", "mo", "msub", "msup", "msubsup"]
if element.name in MERGEABLE_TOKEN_TAGS:
return True

return False

def _can_merge_with_prior_elements(self, element: Tag) -> bool:
"""
Expand All @@ -714,18 +730,41 @@ def _can_merge_with_prior_elements(self, element: Tag) -> bool:
if not element_start == last_element_end:
return False

# The two elements must also have the same style (i.e., if a script letter follows a
# regular letter, then the two are probably separate identifiers).
last_style = last_element.attrs.get("mathvariant")
style = element.attrs.get("mathvariant")
if not last_style == style:
return False

# Here come the context-sensitive rules:
# 1. Letters can be merged into any sequence of elements before them that starts with a
# 1. Scripts (e.g., elements with superscripts and subscripts) can be merged into prior
# elements, provided that the base (the element to which the script is applied) can be
# merged according to the typical merging rules.
if element.name in SCRIPT_TAGS:
first_child = next(element.children, None)
if first_child:
return self._is_mergeable_type(first_child)
return False
# 2. Scripts end all sequences of mergeable characters. This is because no identifier is
# expected to have a superscript or a subscript in the middle.
if last_element.name in SCRIPT_TAGS:
return False
# 3. Letters can be merged into any sequence of elements before them that starts with a
# letter. This allows tokens to be merged into (target letter is shown in
# <angled brackets> identifiers like "r2<d>2", but not constant multiplications like
# <angled brackets>) identifiers like "r2<d>2", but not constant multiplications like
# "4<x>", which should be split into two symbols.
if element.name == "mi":
return bool(self.to_merge[0].name == "mi")
# 2. Numbers can be merged into letters before them, adding to the identifier.
# 3. Numbers can be merged into numbers before them, extending an identifier, or making
# 4. Numbers can be merged into letters before them, adding to the identifier.
# 5. Numbers can be merged into numbers before them, extending an identifier, or making
# a number with multiple digits.
if element.name == "mn":
return True
return last_element.name in ["mi", "mn"]
# 6. Operators can be merged into operators that appear just before them to form multi-
# symbol operators, like '++', '//', etc.
if element.name == "mo":
return last_element.name == "mo"

return False

Expand All @@ -737,22 +776,66 @@ def _merge_prior_elements(self) -> None:
if len(self.to_merge) == 0:
return

if self.to_merge[-1].name in SCRIPT_TAGS:
element = self._merge_script(self.to_merge)
# If elements could not be merged together due to unexpected errors processing the
# script element, then keep all elements separate.
if element is None:
self.merged.extend(self.to_merge)
else:
element = self._merge_simple_row(self.to_merge)

# An identifier should have no children in MathML.
self.merged.append(element)

# Now that the prior elements have been merged, clear the list.
self.to_merge = [] # pylint: disable=attribute-defined-outside-init

def _merge_script(self, elements: List[Tag]) -> Optional[Tag]:
    """
    Fold the elements queued before a script element into the base of that script.

    The last item of 'elements' is taken to be the script element. It is cloned, the innermost
    base of the clone is swapped for a single element that merges all preceding elements with
    that base, and the clone is returned. Returns None when the search for a base runs out of
    children.
    """
    result = clone_element(elements[-1])

    # Scripts may be nested in MathML: x_i^2 can be written as an 'msup' whose first child is an
    # 'msub' whose first child is 'x'. Walk down through first children until something that is
    # not itself a script element is reached; that is the base (the 'x').
    node: Optional[Tag] = result
    while node and node.name in SCRIPT_TAGS:
        node = next(node.children, None)
    if node is None:
        return None

    # Build the replacement base out of every element that came before the script plus the
    # original base, then put it where the original base was so the script applies to it.
    new_base = self._merge_simple_row(elements[:-1] + [node])
    node.replace_with(new_base)

    # The merged base begins earlier than the old one did, so every enclosing element that
    # carries a start offset is moved back to the new start.
    ancestor = new_base.parent
    new_start = new_base.attrs["s2:start"]
    while ancestor:
        if "s2:start" in ancestor.attrs:
            ancestor.attrs["s2:start"] = new_start
        ancestor = ancestor.parent

    return result

def _merge_simple_row(self, elements: List[Tag]) -> Tag:
# Determine the new tag type based on the tags that will be merged. For now, we can assume
# that it's the same as the first type of element that will be merged.
tag_name = self.to_merge[0].name
tag_name = elements[0].name

# Create a new BeautifulSoup object with the contents of all identifiers appended together.
new_text = "".join([n.string for n in self.to_merge])
new_text = "".join([n.string for n in elements])
element = create_element(tag_name)
element.string = new_text
element.attrs["s2:start"] = self.to_merge[0].attrs["s2:start"]
element.attrs["s2:end"] = self.to_merge[-1].attrs["s2:end"]

# An identifier should have no children in MathML.
self.merged.append(element)
element.attrs["s2:start"] = elements[0].attrs["s2:start"]
element.attrs["s2:end"] = elements[-1].attrs["s2:end"]
mathvariant = elements[0].attrs.get("mathvariant")
if mathvariant:
element.attrs["mathvariant"] = mathvariant

# Now that the prior elements have been merged, clear the list.
self.to_merge = [] # pylint: disable=attribute-defined-outside-init
return element


def _has_s2_offset_annotations(tag: BeautifulSoup) -> bool:
Expand Down
6 changes: 6 additions & 0 deletions data-processing/tests/mathml-fragments/bold_relu.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<mrow s2:start="0" s2:end="13" s2:index="4">
<mi mathvariant="bold" s2:start="8" s2:end="9" s2:index="0">R</mi>
<mi mathvariant="bold" s2:start="9" s2:end="10" s2:index="1">e</mi>
<mi mathvariant="bold" s2:start="10" s2:end="11" s2:index="2">L</mi>
<mi mathvariant="bold" s2:start="11" s2:end="12" s2:index="3">U</mi>
</mrow>
4 changes: 4 additions & 0 deletions data-processing/tests/mathml-fragments/double_bar.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<mrow>
<mi s2:start="0" s2:end="1" s2:index="0">∣</mi>
<mi s2:start="1" s2:end="2" s2:index="1">∣</mi>
</mrow>
4 changes: 4 additions & 0 deletions data-processing/tests/mathml-fragments/script_x_regular_y.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<mrow>
<mi mathvariant="script" s2:start="0" s2:end="11" s2:index="1">X</mi>
<mi s2:start="11" s2:end="12" s2:index="2">Y</mi>
</mrow>
9 changes: 9 additions & 0 deletions data-processing/tests/mathml-fragments/word_sub_i.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<mrow>
<mi s2:start="0" s2:end="1" s2:index="0">w</mi>
<mi s2:start="1" s2:end="2" s2:index="1">o</mi>
<mi s2:start="2" s2:end="3" s2:index="2">r</mi>
<msub s2:start="3" s2:end="6" s2:index="5">
<mi s2:start="3" s2:end="4" s2:index="3">d</mi>
<mi s2:start="5" s2:end="6" s2:index="4">i</mi>
</msub>
</mrow>
29 changes: 28 additions & 1 deletion data-processing/tests/test_parse_equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test_parse_single_symbol():
assert result.tokens == [Token("x", "atom", 0, 1)]


def test_merge_contiguous_symbols():
def test_merge_contiguous_identifiers():
result = parse_element(load_fragment_tag("relu.xml"))
assert str(result.element) == "<mi>ReLU</mi>"
symbol = result.symbols[0]
Expand All @@ -44,6 +44,33 @@ def test_merge_contiguous_symbols():
]


def test_merge_contiguous_styled_identifiers():
    # Adjacent bold letters should collapse into one bold identifier, and the resulting symbol
    # should span the character range 0-13 rather than only the range of the letters.
    parsed = parse_element(load_fragment_tag("bold_relu.xml"))
    assert str(parsed.element) == '<mi mathvariant="bold">ReLU</mi>'
    first_symbol = parsed.symbols[0]
    assert first_symbol.start == 0
    assert first_symbol.end == 13


def test_keep_identifiers_with_different_styles_separate():
    # A script-styled letter directly followed by an unstyled letter must remain two symbols.
    parsed = parse_element(load_fragment_tag("script_x_regular_y.xml"))
    assert len(parsed.symbols) == 2
    script_x = parsed.symbols[0]
    regular_y = parsed.symbols[1]
    assert str(script_x.element) == '<mi mathvariant="script">X</mi>'
    assert str(regular_y.element) == "<mi>Y</mi>"


def test_merge_contiguous_identifiers_into_one_with_script():
    # The letters ahead of a subscripted letter should be absorbed into the subscript's base.
    fragment = load_fragment_tag("word_sub_i.xml")
    first_symbol = parse_element(fragment).symbols[0]
    assert str(first_symbol.element) == "<msub><mi>word</mi><mi>i</mi></msub>"


def test_merge_contiguous_operators():
    # Two touching bar characters should come out as one two-character operator.
    fragment = load_fragment_tag("double_bar.xml")
    parsed = parse_element(fragment)
    assert len(parsed.symbols) == 1
    assert str(parsed.element) == "<mo>∣∣</mo>"


def test_merge_contigous_symbols_delimit_at_operator():
result = parse_element(load_fragment_tag("var1_plus_var2.xml"))
assert str(result.element) == "<mrow><mi>var1</mi><mo>+</mo><mi>var2</mi></mrow>"
Expand Down