5
5
import matplotlib .pyplot as plt
6
6
import numpy as np
7
7
import time
8
+ import ast
8
9
from typing import Dict , List , Any , Optional , Tuple
9
10
from tqdm import tqdm
10
11
11
12
from camel .verifiers .python_verifier import PythonVerifier
12
13
from camel .verifiers .models import VerificationOutcome
13
14
from camel .verifiers import MathVerifier
14
15
from physic_verifier_tem import PhysicsVerifier
16
+ from camel .extractors import BaseExtractor , BoxedStrategy
15
17
import logging
16
18
17
19
# Configuration constants
18
20
DEFAULT_MAX_WORKERS = 6
19
- DEFAULT_BATCH_SIZE = 10
21
+ DEFAULT_BATCH_SIZE = 5
20
22
DEFAULT_TIMEOUT = 60.0
21
23
DEFAULT_CONCURRENT_BATCHES = 5 # Number of batches to process concurrently
22
24
ENV_CACHE_ENABLED = True # Enable caching of virtual environments
28
30
)
29
31
30
32
31
- async def setup_verifier (required_packages : List [str ], timeout : float = 60.0 ) -> PythonVerifier :
33
+ async def setup_verifier (required_packages : List [str ], timeout : float = DEFAULT_TIMEOUT , domain : str = None ) -> PythonVerifier :
32
34
"""
33
35
Set up a Python verifier with the required packages.
34
36
35
37
Args:
36
38
required_packages: List of required packages with versions.
37
39
timeout: Timeout for code execution in seconds.
40
+ domain: The problem domain (e.g., 'mathematical_programming').
38
41
39
42
Returns:
40
43
A configured PythonVerifier instance.
41
44
"""
45
+ # Set longer timeout for Mathematical Programming domain
46
+ if domain == "mathematical_programming" :
47
+ timeout = 240.0
48
+
42
49
verifier = PythonVerifier (
43
50
timeout = timeout ,
44
51
required_packages = required_packages )
@@ -88,7 +95,7 @@ async def compare_results(execution_result: str, final_answer: str, domain: str
88
95
Args:
89
96
execution_result: The result from code execution.
90
97
final_answer: The expected final answer.
91
- domain: The problem domain (e.g., 'advanced_math ').
98
+ domain: The problem domain (e.g., 'mathematical_programming ').
92
99
93
100
Returns:
94
101
True if the results match, False otherwise.
@@ -104,6 +111,16 @@ async def compare_results(execution_result: str, final_answer: str, domain: str
104
111
return verification_result .status == VerificationOutcome .SUCCESS
105
112
except Exception :
106
113
pass
114
+ elif domain == "mathematical_programming" and execution_result is not None and final_answer is not None :
115
+ try :
116
+ math_programming_verifier = await get_math_programming_verifier ()
117
+ verification_result = await math_programming_verifier .verify (
118
+ solution = execution_result ,
119
+ reference_answer = final_answer
120
+ )
121
+ return verification_result .status == VerificationOutcome .SUCCESS
122
+ except Exception :
123
+ pass
107
124
108
125
if execution_result == final_answer :
109
126
return True
@@ -118,6 +135,8 @@ async def compare_results(execution_result: str, final_answer: str, domain: str
118
135
# Initialize physics verifier for advanced_physics domain
119
136
_physics_verifier = None
120
137
138
+ # Initialize mathematical programming verifier for mathematical_programming domain
139
+ _math_programming_verifier = None
121
140
122
141
async def get_math_verifier ():
123
142
"""
@@ -145,12 +164,34 @@ async def get_physics_verifier():
145
164
await _physics_verifier .setup (uv = True )
146
165
return _physics_verifier
147
166
148
- async def get_or_create_verifier (required_packages : List [str ]) -> Tuple [PythonVerifier , bool ]:
167
async def get_math_programming_verifier():
    """
    Get or initialize the Mathematical Programming verifier.

    The verifier is created lazily on first use and then shared through the
    module-level ``_math_programming_verifier`` global. It is stored in that
    global only after its ``setup`` call has completed, so a caller that
    arrives while setup is still awaiting, or after a setup that raised,
    never receives a verifier whose setup did not finish.

    Returns:
        Mathematical Programming verifier
    """
    global _math_programming_verifier
    if _math_programming_verifier is None:
        # Initialize extractor
        extractor = BaseExtractor([[BoxedStrategy()]])
        await extractor.setup()
        timeout = 220.0
        # Build into a local first: assigning the global before the awaited
        # setup below would expose a half-initialized verifier to any other
        # coroutine that runs during that await.
        verifier = PythonVerifier(
            timeout=timeout,
            required_packages=["pyscipopt", "pandas", "gurobipy", "cvxpy", "matplotlib", "geopy"],
            extractor=extractor,
        )
        await verifier.setup(uv=True)
        if _math_programming_verifier is None:
            _math_programming_verifier = verifier
        else:
            # Another coroutine finished its own setup while this one was
            # awaiting; keep the published instance and release this one.
            await verifier.cleanup()
    return _math_programming_verifier
187
+
188
+ async def get_or_create_verifier (required_packages : List [str ], domain : str = None ) -> Tuple [PythonVerifier , bool ]:
149
189
"""
150
190
Get a verifier from cache or create a new one if needed.
151
191
152
192
Args:
153
193
required_packages: List of required packages with versions.
194
+ domain: The problem domain (e.g., 'mathematical_programming').
154
195
155
196
Returns:
156
197
Tuple of (verifier, is_from_cache)
@@ -162,7 +203,7 @@ async def get_or_create_verifier(required_packages: List[str]) -> Tuple[PythonVe
162
203
return _verifier_cache [cache_key ], True
163
204
164
205
# Create a new verifier
165
- verifier = await setup_verifier (required_packages )
206
+ verifier = await setup_verifier (required_packages , domain = domain )
166
207
167
208
if ENV_CACHE_ENABLED :
168
209
_verifier_cache [cache_key ] = verifier
@@ -176,6 +217,7 @@ async def process_batch(batch: List[Tuple[int, Dict[str, Any]]], required_packag
176
217
Args:
177
218
batch: List of (index, item) tuples to process.
178
219
required_packages: List of required packages with versions.
220
+ domain: The problem domain (e.g., 'mathematical_programming').
179
221
180
222
Returns:
181
223
List of (index, result) tuples.
@@ -184,7 +226,7 @@ async def process_batch(batch: List[Tuple[int, Dict[str, Any]]], required_packag
184
226
return []
185
227
186
228
# Get or create a verifier for this batch
187
- verifier , from_cache = await get_or_create_verifier (required_packages )
229
+ verifier , from_cache = await get_or_create_verifier (required_packages , domain )
188
230
189
231
try :
190
232
# Process items concurrently within the batch
@@ -207,6 +249,7 @@ async def process_single_item(item_tuple: Tuple[int, Dict[str, Any]], verifier:
207
249
Args:
208
250
item_tuple: Tuple containing (index, item) from the dataset.
209
251
verifier: The Python verifier to use.
252
+ domain: The problem domain (e.g., 'mathematical_programming').
210
253
211
254
Returns:
212
255
Tuple of (index, result dictionary).
@@ -321,9 +364,9 @@ async def group_by_packages(items: List[Tuple[int, Dict[str, Any]]]) -> Dict[Tup
321
364
elif item .get ("metadata" , {}).get ("required_dependencies" ):
322
365
packages = item .get ("metadata" , {}).get ("required_dependencies" , [])
323
366
# Check for Mathematical Programming domain that needs pyscipopt
324
- elif item .get ("metadata" , {}).get ("domain" ) == "Mathematical_Programming " or \
367
+ elif item .get ("metadata" , {}).get ("domain" ) == "Mathematical Programming " or \
325
368
(item .get ("metadata" , {}).get ("library" ) == "SCIP" ):
326
- packages = ["pyscipopt" , "pandas" , "gurobipy" , "cvxpy" , "matplotlib" ,]
369
+ packages = ["pyscipopt" , "pandas" , "gurobipy" , "cvxpy" , "matplotlib" , "geopy" ]
327
370
328
371
# Sort packages to ensure consistent grouping
329
372
key = tuple (sorted (packages ))
@@ -415,6 +458,8 @@ async def process_dataset(
415
458
# Process each domain
416
459
logger .info (f"Processing { len (dataset )} domains..." )
417
460
for domain , items in dataset .items ():
461
+ if domain != "mathematical_programming" :
462
+ continue
418
463
domain_start_time = time .time ()
419
464
420
465
# Limit the number of samples if specified
@@ -718,6 +763,10 @@ async def main():
718
763
if _physics_verifier :
719
764
await _physics_verifier .cleanup ()
720
765
766
+ # Cleanup Mathematical Programming verifier if initialized
767
+ if _math_programming_verifier :
768
+ await _math_programming_verifier .cleanup ()
769
+
721
770
logger .info ("Processing complete." )
722
771
723
772
0 commit comments