datajuicer
diff --git a/data_juicer/config/config_all.yaml b/data_juicer/config/config_all.yaml
Lines changed: 8 additions & 0 deletions
diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
Lines changed: 2 additions & 0 deletions
@@ -198,6 +198,14 @@ process:
       model_params: {}                                        # Parameters for initializing the API model.
       sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
   - expand_macro_mapper:                                    # Expand macro definitions in LaTeX text.
+  - latex_figure_context_extractor_mapper:                  # Extract figures and their citing context from LaTeX source.
+      citation_commands: ['\ref', '\cref', '\Cref', '\autoref']  # LaTeX reference commands to search for citing paragraphs.
+      paragraph_separator: '\n\n'                               # Pattern for splitting LaTeX text into paragraphs. Single-quoted YAML keeps backslashes literal: the value is the characters \n\n, not two newlines.
+      caption_key: 'caption'                                  # Output field name for the figure caption.
+      label_key: 'label'                                      # Output field name for the LaTeX label.
+      context_key: 'citing_paragraphs'                        # Output field name for citing paragraphs.
+      parent_caption_key: 'parent_caption'                    # Output field name for the parent figure's caption (subfigures only).
+      parent_label_key: 'parent_label'                        # Output field name for the parent figure's label (for grouping subfigures).
   - extract_entity_attribute_mapper:                        # Extract attributes for given entities from the text.
       api_model: 'gpt-4o'                                     # API model name.
       query_entities: ["孙悟空", "猪八戒"]                      # Entity list to be queried.
 
@@ -47,6 +47,7 @@
 from .imgdiff_difference_caption_generator_mapper import (
     Difference_Caption_Generator_Mapper,
 )
+from .latex_figure_context_extractor_mapper import LatexFigureContextExtractorMapper
 from .mllm_mapper import MllmMapper
 from .nlpaug_en_mapper import NlpaugEnMapper
 from .nlpcda_zh_mapper import NlpcdaZhMapper
@@ -159,6 +160,7 @@
     "ImageSegmentMapper",
     "ImageTaggingMapper",
     "ImageTaggingVLMMapper",
+    "LatexFigureContextExtractorMapper",
     "MllmMapper",
     "NlpaugEnMapper",
     "NlpcdaZhMapper",