diff --git a/data-processing/common/parse_equation.py b/data-processing/common/parse_equation.py
index b6b318b9..a27babba 100644
--- a/data-processing/common/parse_equation.py
+++ b/data-processing/common/parse_equation.py
@@ -137,7 +137,7 @@ def repair_operator_tags(element: Tag) -> None:
     if element.name != "mi":
         return
 
-    if element.text in ["∀", "∃", "|", "∥", "."]:
+    if element.text in ["∀", "∃", "|", "∥", "∣", ".", "/", "%"]:
         operator = clone_element(element)
         operator.name = "mo"
         element.replace_with(operator)
@@ -196,8 +196,16 @@ def merge_row_elements(element: Tag) -> None:
     merged = merger.merge(elements)
 
     # If the 'mrow' only contains one element after its children are merged, simplify the
-    # MathML tree replacing this node with its merged child.
+    # MathML tree replacing this node with its merged child. Preserve the start and end
+    # position of the row element if it is specified, because this often means that a styling
+    # macro was applied to the children, and the start and end positions of the row include
+    # the control sequence and braces for the styling macro.
     if len(merged) == 1:
+        start = element.attrs.get("s2:start")
+        end = element.attrs.get("s2:end")
+        if start and end:
+            merged[0].attrs["s2:start"] = start
+            merged[0].attrs["s2:end"] = end
         element.replace_with(merged[0])
     else:
         for e in elements:
@@ -656,6 +664,9 @@ def create_element(tag_name: str) -> Tag:
     return BeautifulSoup("", "lxml").new_tag(tag_name)
 
 
+SCRIPT_TAGS = ["msub", "msup", "msubsup"]
+
+
 class MathMlElementMerger:
     def merge(self, elements: List[Tag]) -> List[Tag]:
         """
@@ -691,10 +702,15 @@ def merge(self, elements: List[Tag]) -> List[Tag]:
 
     def _is_mergeable_type(self, element: Tag) -> bool:
         " Determine if a element is a type that is mergeable with other elements. "
-        MERGEABLE_TOKEN_TAGS = ["mn", "mi"]
-        return element.name in MERGEABLE_TOKEN_TAGS and _has_s2_offset_annotations(
-            element
-        )
+
+        if not _has_s2_offset_annotations(element):
+            return False
+
+        MERGEABLE_TOKEN_TAGS = ["mn", "mi", "mo", "msub", "msup", "msubsup"]
+        if element.name in MERGEABLE_TOKEN_TAGS:
+            return True
+
+        return False
 
     def _can_merge_with_prior_elements(self, element: Tag) -> bool:
         """
@@ -714,18 +730,41 @@ def _can_merge_with_prior_elements(self, element: Tag) -> bool:
         if not element_start == last_element_end:
             return False
 
+        # The two elements must also have the same style (i.e., if a script letter follows a
+        # regular letter, then the two are probably separate identifiers).
+        last_style = last_element.attrs.get("mathvariant")
+        style = element.attrs.get("mathvariant")
+        if not last_style == style:
+            return False
+
         # Here come the context-sensitive rules:
-        # 1. Letters can be merged into any sequence of elements before them that starts with a
+        # 1. Scripts (e.g., elements with superscripts and subscripts) can be merged into prior
+        #    elements, provided that the base (the element to which the script is applied) can be
+        #    merged according to the typical merging rules.
+        if element.name in SCRIPT_TAGS:
+            first_child = next(element.children, None)
+            if first_child:
+                return self._is_mergeable_type(first_child)
+            return False
+        # 2. Scripts end all sequences of mergeable characters. This is because no identifier is
+        #    expected to have a superscript or a subscript in the middle.
+        if last_element.name in SCRIPT_TAGS:
+            return False
+        # 3. Letters can be merged into any sequence of elements before them that starts with
         #    a letter. This allows tokens to be merged into (target letter is shown in
-        #    identifiers like "r22", but not constant multiplications like
+        #    ) identifiers like "r22", but not constant multiplications like
         #    "4", which should be split into two symbols.
         if element.name == "mi":
             return bool(self.to_merge[0].name == "mi")
-        # 2. Numbers can be merged into letters before them, adding to the identifier.
-        # 3. Numbers can be merged into numbers before them, extending an identifier, or making
+        # 4. Numbers can be merged into letters before them, adding to the identifier.
+        # 5. Numbers can be merged into numbers before them, extending an identifier, or making
         #    a number with multiple digits.
         if element.name == "mn":
-            return True
+            return last_element.name in ["mi", "mn"]
+        # 6. Operators can be merged into operators that appear just before them to form multi-
+        #    symbol operators, like '++', '//', etc.
+        if element.name == "mo":
+            return last_element.name == "mo"
 
         return False
 
@@ -737,22 +776,77 @@ def _merge_prior_elements(self) -> None:
         if len(self.to_merge) == 0:
             return
 
+        if self.to_merge[-1].name in SCRIPT_TAGS:
+            element = self._merge_script(self.to_merge)
+        else:
+            element = self._merge_simple_row(self.to_merge)
+
+        # If elements could not be merged together due to unexpected errors processing the
+        # script element, then keep all elements separate.
+        if element is None:
+            self.merged.extend(self.to_merge)
+        else:
+            # An identifier should have no children in MathML.
+            self.merged.append(element)
+
+        # Now that the prior elements have been merged, clear the list.
+        self.to_merge = []  # pylint: disable=attribute-defined-outside-init
+
+    def _merge_script(self, elements: List[Tag]) -> Optional[Tag]:
+        """
+        Merge simple elements into the base of the script element that follows them.
+        Returns a clone of the script (last) element whose base has been replaced by the
+        merged base, or None if no base could be found for the script.
+        """
+        script_element = clone_element(elements[-1])
+
+        # Get the base to which the script is applied (e.g., 'x' in 'x^2'). The base is extracted
+        # recursively, as it is valid MathML to specify nested scripts. For example, x_i^2 could
+        # be expressed with 'x' as a base inside an 'msub' element (for 'x_i') inside an 'msup'
+        # element (for 'x_i^2').
+        base: Optional[Tag] = script_element
+        while base and base.name in SCRIPT_TAGS:
+            base = next(base.children, None)
+
+        if base is None:
+            return None
+
+        # Create a new base by merging in all of the simple elements appearing before the base
+        # with the base, and then applying the script to the merged base.
+        merged_base = self._merge_simple_row(elements[:-1] + [base])
+        base.replace_with(merged_base)
+
+        # Adjust offset of parent elements to reflect the expanded character range of the base.
+        parent = merged_base.parent
+        start = merged_base.attrs["s2:start"]
+        while parent:
+            if "s2:start" in parent.attrs:
+                parent.attrs["s2:start"] = start
+            parent = parent.parent
+
+        return script_element
+
+    def _merge_simple_row(self, elements: List[Tag]) -> Tag:
+        """
+        Merge elements into one token element with their text concatenated. The merged element
+        takes the tag name and 'mathvariant' of the first element, and spans the character
+        range from the start of the first element to the end of the last.
+        """
         # Determine the new tag type based on the tags that will be merged. For now, we can assume
         # that it's the same as the first type of element that will be merged.
-        tag_name = self.to_merge[0].name
+        tag_name = elements[0].name
 
         # Create a new BeautifulSoup object with the contents of all identifiers appended together.
-        new_text = "".join([n.string for n in self.to_merge])
+        new_text = "".join([n.string for n in elements])
         element = create_element(tag_name)
         element.string = new_text
-        element.attrs["s2:start"] = self.to_merge[0].attrs["s2:start"]
-        element.attrs["s2:end"] = self.to_merge[-1].attrs["s2:end"]
-
-        # An identifier should have no children in MathML.
-        self.merged.append(element)
+        element.attrs["s2:start"] = elements[0].attrs["s2:start"]
+        element.attrs["s2:end"] = elements[-1].attrs["s2:end"]
+        mathvariant = elements[0].attrs.get("mathvariant")
+        if mathvariant:
+            element.attrs["mathvariant"] = mathvariant
 
-        # Now that the prior elements have been merged, clear the list.
-        self.to_merge = []  # pylint: disable=attribute-defined-outside-init
+        return element
 
 
 def _has_s2_offset_annotations(tag: BeautifulSoup) -> bool:
diff --git a/data-processing/tests/mathml-fragments/bold_relu.xml b/data-processing/tests/mathml-fragments/bold_relu.xml
new file mode 100644
index 00000000..7ce01cf6
--- /dev/null
+++ b/data-processing/tests/mathml-fragments/bold_relu.xml
@@ -0,0 +1,6 @@
+
+ R
+ e
+ L
+ U
+
\ No newline at end of file
diff --git a/data-processing/tests/mathml-fragments/double_bar.xml b/data-processing/tests/mathml-fragments/double_bar.xml
new file mode 100644
index 00000000..6f13a04c
--- /dev/null
+++ b/data-processing/tests/mathml-fragments/double_bar.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/data-processing/tests/mathml-fragments/script_x_regular_y.xml b/data-processing/tests/mathml-fragments/script_x_regular_y.xml
new file mode 100644
index 00000000..fab25bb1
--- /dev/null
+++ b/data-processing/tests/mathml-fragments/script_x_regular_y.xml
@@ -0,0 +1,4 @@
+
+ X
+ Y
+
\ No newline at end of file
diff --git a/data-processing/tests/mathml-fragments/word_sub_i.xml b/data-processing/tests/mathml-fragments/word_sub_i.xml
new file mode 100644
index 00000000..664c0382
--- /dev/null
+++ b/data-processing/tests/mathml-fragments/word_sub_i.xml
@@ -0,0 +1,9 @@
+
+ w
+ o
+ r
+
+ d
+ i
+
+
\ No newline at end of file
diff --git a/data-processing/tests/test_parse_equation.py b/data-processing/tests/test_parse_equation.py
index 6c449d19..fd8bd101 100644
--- a/data-processing/tests/test_parse_equation.py
+++ b/data-processing/tests/test_parse_equation.py
@@ -33,7 +33,7 @@ def test_parse_single_symbol():
     assert result.tokens == [Token("x", "atom", 0, 1)]
 
 
-def test_merge_contiguous_symbols():
+def test_merge_contiguous_identifiers():
     result = parse_element(load_fragment_tag("relu.xml"))
     assert str(result.element) == "ReLU"
     symbol = result.symbols[0]
@@ -44,6 +44,33 @@ def test_merge_contiguous_symbols():
     ]
 
 
+def test_merge_contiguous_styled_identifiers():
+    result = parse_element(load_fragment_tag("bold_relu.xml"))
+    assert str(result.element) == 'ReLU'
+    symbol = result.symbols[0]
+    assert symbol.start == 0
+    assert symbol.end == 13
+
+
+def test_keep_identifiers_with_different_styles_separate():
+    result = parse_element(load_fragment_tag("script_x_regular_y.xml"))
+    assert len(result.symbols) == 2
+    assert str(result.symbols[0].element) == 'X'
+    assert str(result.symbols[1].element) == "Y"
+
+
+def test_merge_contiguous_identifiers_into_one_with_script():
+    result = parse_element(load_fragment_tag("word_sub_i.xml"))
+    symbol = result.symbols[0]
+    assert str(symbol.element) == "wordi"
+
+
+def test_merge_contiguous_operators():
+    result = parse_element(load_fragment_tag("double_bar.xml"))
+    assert len(result.symbols) == 1
+    assert str(result.element) == "∣∣"
+
+
 def test_merge_contigous_symbols_delimit_at_operator():
     result = parse_element(load_fragment_tag("var1_plus_var2.xml"))
     assert str(result.element) == "var1+var2"