huggingface · hynky1999 · Feb 5, 2025 · Feb 4, 2025 · Feb 5, 2025 · Feb 5, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -109,7 +109,7 @@ multilingual = [
     "jieba", # for chinese tokenizer
     "pyvi", # for vietnamese tokenizer
 ]
-math = ["latex2sympy2_extended>=0.9.3"]
+math = ["latex2sympy2_extended==1.0.4"]
 
 [project.urls]
 Homepage = "https://github.com/huggingface/lighteval"

diff --git a/src/lighteval/metrics/dynamic_metrics.py b/src/lighteval/metrics/dynamic_metrics.py
@@ -193,6 +193,7 @@ def multilingual_extractive_match_metric(
     fallback_mode: Literal["no_fallback", "first_match"] = "first_match",
     extraction_mode: Literal["first_match", "any_match"] = "any_match",
     precision: int = 6,
+    timeout_seconds: int = 5,
 ) -> SampleLevelMetric:
     """Creates a language-aware extractive match metric that extracts answers from the model's output.
 
@@ -222,6 +223,8 @@ def multilingual_extractive_match_metric(
 
         precision: int
             Number of decimal places to use when comparing numerical values. Defaults to 6.
+        timeout_seconds: int
+            Timeout for the extraction (each attempt) and comparison. Defaults to 5.
 
     Returns:
         A sample level metric that extracts and compares mathematical expressions.
@@ -245,11 +248,12 @@ def sample_level_fn(golds: list[str], predictions: list[str], formatted_doc: Doc
         pred_extraction_regexes = get_extraction_regexes(formatted_doc, pred_extraction_target, language)
 
         extracted_predictions = [
-            extract_target_from_pred(pred, pred_extraction_regexes, fallback_mode, extraction_mode)
+            extract_target_from_pred(pred, pred_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds)
             for pred in predictions
         ]
         extracted_golds = [
-            extract_target_from_pred(gold, gold_extraction_regexes, fallback_mode, extraction_mode) for gold in golds
+            extract_target_from_pred(gold, gold_extraction_regexes, fallback_mode, extraction_mode, timeout_seconds)
+            for gold in golds
         ]
 
         # Assert on empty gold and warn on empty pred
@@ -265,12 +269,19 @@ def sample_level_fn(golds: list[str], predictions: list[str], formatted_doc: Doc
         # We have to use timeout because the sypmy to str conversion can be very slow
         try:
             add_to_specifics_with_timeout(formatted_doc, extracted_predictions, extracted_golds)
-        except:  # noqa: E722
+        except Exception:  # noqa: E722
             logger.warning("Timeout when adding extracted predictions and golds to specific")
 
         return aggregation_function(
             [
-                (1.0 if any(compare_gold_target(gold, pred, precision) for gold in extracted_golds) else 0.0)
+                (
+                    1.0
+                    if any(
+                        compare_gold_target(gold, pred, precision, timeout_seconds=timeout_seconds)
+                        for gold in extracted_golds
+                    )
+                    else 0.0
+                )
                 for pred in extracted_predictions
             ]
         )