Skip to content

Commit b143b43

Browse files
committed
update changes for execute_and_compare.py. Still need to fix a few data
1 parent e2bcffd commit b143b43

File tree

8 files changed

+2930
-7352
lines changed

8 files changed

+2930
-7352
lines changed

data/detailed_domain_results/mathematical_programming_detailed.json

Lines changed: 0 additions & 1271 deletions
This file was deleted.

data/execute_and_compare.py

Lines changed: 57 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,20 @@
55
import matplotlib.pyplot as plt
66
import numpy as np
77
import time
8+
import ast
89
from typing import Dict, List, Any, Optional, Tuple
910
from tqdm import tqdm
1011

1112
from camel.verifiers.python_verifier import PythonVerifier
1213
from camel.verifiers.models import VerificationOutcome
1314
from camel.verifiers import MathVerifier
1415
from physic_verifier_tem import PhysicsVerifier
16+
from camel.extractors import BaseExtractor, BoxedStrategy
1517
import logging
1618

1719
# Configuration constants
1820
DEFAULT_MAX_WORKERS = 6
19-
DEFAULT_BATCH_SIZE = 10
21+
DEFAULT_BATCH_SIZE = 5
2022
DEFAULT_TIMEOUT = 60.0
2123
DEFAULT_CONCURRENT_BATCHES = 5 # Number of batches to process concurrently
2224
ENV_CACHE_ENABLED = True # Enable caching of virtual environments
@@ -28,17 +30,22 @@
2830
)
2931

3032

31-
async def setup_verifier(required_packages: List[str], timeout: float = 60.0) -> PythonVerifier:
33+
async def setup_verifier(required_packages: List[str], timeout: float = DEFAULT_TIMEOUT, domain: str = None) -> PythonVerifier:
3234
"""
3335
Set up a Python verifier with the required packages.
3436
3537
Args:
3638
required_packages: List of required packages with versions.
3739
timeout: Timeout for code execution in seconds.
40+
domain: The problem domain (e.g., 'mathematical_programming').
3841
3942
Returns:
4043
A configured PythonVerifier instance.
4144
"""
45+
# Set longer timeout for Mathematical Programming domain
46+
if domain == "mathematical_programming":
47+
timeout = 240.0
48+
4249
verifier = PythonVerifier(
4350
timeout=timeout,
4451
required_packages=required_packages)
@@ -88,7 +95,7 @@ async def compare_results(execution_result: str, final_answer: str, domain: str
8895
Args:
8996
execution_result: The result from code execution.
9097
final_answer: The expected final answer.
91-
domain: The problem domain (e.g., 'advanced_math').
98+
domain: The problem domain (e.g., 'mathematical_programming').
9299
93100
Returns:
94101
True if the results match, False otherwise.
@@ -104,6 +111,16 @@ async def compare_results(execution_result: str, final_answer: str, domain: str
104111
return verification_result.status == VerificationOutcome.SUCCESS
105112
except Exception:
106113
pass
114+
elif domain == "mathematical_programming" and execution_result is not None and final_answer is not None:
115+
try:
116+
math_programming_verifier = await get_math_programming_verifier()
117+
verification_result = await math_programming_verifier.verify(
118+
solution=execution_result,
119+
reference_answer=final_answer
120+
)
121+
return verification_result.status == VerificationOutcome.SUCCESS
122+
except Exception:
123+
pass
107124

108125
if execution_result == final_answer:
109126
return True
@@ -118,6 +135,8 @@ async def compare_results(execution_result: str, final_answer: str, domain: str
118135
# Initialize physics verifier for advanced_physics domain
119136
_physics_verifier = None
120137

138+
# Shared verifier for the mathematical_programming domain; stays None until get_math_programming_verifier() creates it on first use
139+
_math_programming_verifier = None
121140

122141
async def get_math_verifier():
123142
"""
@@ -145,12 +164,34 @@ async def get_physics_verifier():
145164
await _physics_verifier.setup(uv=True)
146165
return _physics_verifier
147166

148-
async def get_or_create_verifier(required_packages: List[str]) -> Tuple[PythonVerifier, bool]:
167+
# Guards first-time construction of the shared verifier. Created lazily inside
# the coroutine so it is always bound to the event loop that is actually running.
_math_programming_verifier_lock = None


async def get_math_programming_verifier():
    """
    Get or lazily initialize the shared Mathematical Programming verifier.

    The verifier is stored in the module-level ``_math_programming_verifier``
    global and is built at most once. It is only published to that global
    after its ``setup`` call has completed, so callers never receive a
    half-initialized instance and a failed setup is retried on the next call
    instead of leaving a broken verifier cached.

    Returns:
        The shared PythonVerifier configured with a boxed-answer extractor
        and the mathematical programming package set.
    """
    global _math_programming_verifier, _math_programming_verifier_lock

    # Fast path: already built and fully set up.
    if _math_programming_verifier is not None:
        return _math_programming_verifier

    # Local import keeps this block self-contained regardless of the
    # file's top-level imports.
    import asyncio

    # No await between the check and the assignment, so only one lock
    # object can ever be created.
    if _math_programming_verifier_lock is None:
        _math_programming_verifier_lock = asyncio.Lock()

    async with _math_programming_verifier_lock:
        # Re-check under the lock: another coroutine may have finished the
        # initialization while this one was waiting.
        if _math_programming_verifier is None:
            # Initialize extractor
            extractor = BaseExtractor([[BoxedStrategy()]])
            await extractor.setup()

            verifier = PythonVerifier(
                timeout=220.0,  # execution timeout in seconds
                required_packages=[
                    "pyscipopt",
                    "pandas",
                    "gurobipy",
                    "cvxpy",
                    "matplotlib",
                    "geopy",
                ],
                extractor=extractor,
            )
            await verifier.setup(uv=True)

            # Publish only after setup succeeded.
            _math_programming_verifier = verifier

    return _math_programming_verifier
187+
188+
async def get_or_create_verifier(required_packages: List[str], domain: str = None) -> Tuple[PythonVerifier, bool]:
149189
"""
150190
Get a verifier from cache or create a new one if needed.
151191
152192
Args:
153193
required_packages: List of required packages with versions.
194+
domain: The problem domain (e.g., 'mathematical_programming').
154195
155196
Returns:
156197
Tuple of (verifier, is_from_cache)
@@ -162,7 +203,7 @@ async def get_or_create_verifier(required_packages: List[str]) -> Tuple[PythonVe
162203
return _verifier_cache[cache_key], True
163204

164205
# Create a new verifier
165-
verifier = await setup_verifier(required_packages)
206+
verifier = await setup_verifier(required_packages, domain=domain)
166207

167208
if ENV_CACHE_ENABLED:
168209
_verifier_cache[cache_key] = verifier
@@ -176,6 +217,7 @@ async def process_batch(batch: List[Tuple[int, Dict[str, Any]]], required_packag
176217
Args:
177218
batch: List of (index, item) tuples to process.
178219
required_packages: List of required packages with versions.
220+
domain: The problem domain (e.g., 'mathematical_programming').
179221
180222
Returns:
181223
List of (index, result) tuples.
@@ -184,7 +226,7 @@ async def process_batch(batch: List[Tuple[int, Dict[str, Any]]], required_packag
184226
return []
185227

186228
# Get or create a verifier for this batch
187-
verifier, from_cache = await get_or_create_verifier(required_packages)
229+
verifier, from_cache = await get_or_create_verifier(required_packages, domain)
188230

189231
try:
190232
# Process items concurrently within the batch
@@ -207,6 +249,7 @@ async def process_single_item(item_tuple: Tuple[int, Dict[str, Any]], verifier:
207249
Args:
208250
item_tuple: Tuple containing (index, item) from the dataset.
209251
verifier: The Python verifier to use.
252+
domain: The problem domain (e.g., 'mathematical_programming').
210253
211254
Returns:
212255
Tuple of (index, result dictionary).
@@ -321,9 +364,9 @@ async def group_by_packages(items: List[Tuple[int, Dict[str, Any]]]) -> Dict[Tup
321364
elif item.get("metadata", {}).get("required_dependencies"):
322365
packages = item.get("metadata", {}).get("required_dependencies", [])
323366
# Check for Mathematical Programming domain that needs pyscipopt
324-
elif item.get("metadata", {}).get("domain") == "Mathematical_Programming" or \
367+
elif item.get("metadata", {}).get("domain") == "Mathematical Programming" or \
325368
(item.get("metadata", {}).get("library") == "SCIP"):
326-
packages = ["pyscipopt", "pandas", "gurobipy", "cvxpy", "matplotlib",]
369+
packages = ["pyscipopt", "pandas", "gurobipy", "cvxpy", "matplotlib", "geopy"]
327370

328371
# Sort packages to ensure consistent grouping
329372
key = tuple(sorted(packages))
@@ -415,6 +458,8 @@ async def process_dataset(
415458
# Process each domain
416459
logger.info(f"Processing {len(dataset)} domains...")
417460
for domain, items in dataset.items():
461+
if domain != "mathematical_programming":
462+
continue
418463
domain_start_time = time.time()
419464

420465
# Limit the number of samples if specified
@@ -718,6 +763,10 @@ async def main():
718763
if _physics_verifier:
719764
await _physics_verifier.cleanup()
720765

766+
# Cleanup Mathematical Programming verifier if initialized
767+
if _math_programming_verifier:
768+
await _math_programming_verifier.cleanup()
769+
721770
logger.info("Processing complete.")
722771

723772

Loading
Loading
Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,2 @@
11
Domain,Total Samples,Execution Success Rate,Match Rate
2-
logic,85,100.0,100.0
3-
security_and_safety,516,100.0,100.0
4-
advanced_physics,429,98.6013986013986,98.6013986013986
5-
graph_discrete_math,178,100.0,96.06741573033707
6-
advanced_math,1615,100.0,78.76160990712074
7-
finance,315,58.0952380952381,25.71428571428571
8-
mathematical_programming,68,77.94117647058823,1.4705882352941175
9-
computational_biology,304,80.26315789473685,0.0
10-
games,926,100.0,0.0
2+
mathematical_programming,68,97.05882352941177,80.88235294117648

data/execution_comparison_visualizations/summary_results.md

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,12 @@
22

33
## Overall Statistics
44

5-
- **Total Samples Analyzed**: 4436
6-
- **Average Execution Success Rate**: 90.54%
7-
- **Average Match Rate**: 55.62%
5+
- **Total Samples Analyzed**: 68
6+
- **Average Execution Success Rate**: 97.06%
7+
- **Average Match Rate**: 80.88%
88

99
## Domain-Specific Results
1010

1111
| Domain | Total Samples | Execution Success Rate | Match Rate |
1212
|:-------------------------|----------------:|-------------------------:|-------------:|
13-
| logic | 85 | 100 | 100 |
14-
| security_and_safety | 516 | 100 | 100 |
15-
| advanced_physics | 429 | 98.6014 | 98.6014 |
16-
| graph_discrete_math | 178 | 100 | 96.0674 |
17-
| advanced_math | 1615 | 100 | 78.7616 |
18-
| finance | 315 | 58.0952 | 25.7143 |
19-
| mathematical_programming | 68 | 77.9412 | 1.47059 |
20-
| computational_biology | 304 | 80.2632 | 0 |
21-
| games | 926 | 100 | 0 |
13+
| mathematical_programming | 68 | 97.0588 | 80.8824 |

0 commit comments

Comments
 (0)