
.json file is missing all messages when an error is thrown #672

Open

JasonGross opened this issue Oct 9, 2024 · 1 comment

@JasonGross (Contributor) commented Oct 9, 2024

If an error is thrown during execution, the entire trace is lost, including debug messages. Instead, all messages up to the error, as well as all debug messages, should be included in the log. (Ideally, inspect view would also include a tab with the messages sent before the error.)

Example

# %%
from inspect_ai import Task, task
from inspect_ai.dataset import example_dataset
from inspect_ai.scorer import model_graded_fact
from inspect_ai.solver import (
    Generate,
    Solver,
    TaskState,
    generate,
    prompt_template,
    self_critique,
    solver,
)

DEFAULT_PROMPT = "{prompt}"


# Minimal solver that raises immediately, reproducing the lost-log behavior
@solver
def error() -> Solver:
    async def error(state: TaskState, generate: Generate) -> TaskState:
        raise RuntimeError("An error occurred")

    return error


@task
def theory_of_mind():
    return Task(
        dataset=example_dataset("theory_of_mind")[:1],
        solver=[prompt_template(DEFAULT_PROMPT), generate(), error()],
        scorer=model_graded_fact(),
    )

Run

inspect eval test.py --model anthropic/claude-3-5-sonnet-20240620 --log-level debug

The resulting log is:

{
  "version": 2,
  "status": "error",
  "eval": {
    "run_id": "TZvuNq4J5AAh9xu66qi8GQ",
    "created": "2024-10-08T20:05:48-07:00",
    "task": "theory_of_mind",
    "task_id": "8w83FdHkWpKU4KkUUzzGa6",
    "task_version": 0,
    "task_file": "test.py",
    "task_attribs": {},
    "task_args": {},
    "dataset": {
      "name": "theory_of_mind",
      "location": "example://theory_of_mind",
      "samples": 1,
      "shuffled": false
    },
    "model": "anthropic/claude-3-5-sonnet-20240620",
    "model_args": {},
    "config": {},
    "revision": {
      "type": "git",
      "origin": "<redacted>",
      "commit": "<redacted>"
    },
    "packages": {
      "inspect_ai": "0.3.40"
    }
  },
  "plan": {
    "name": "plan",
    "steps": [
      {
        "solver": "prompt_template",
        "params": {
          "template": "{prompt}"
        }
      },
      {
        "solver": "generate",
        "params": {}
      },
      {
        "solver": "error",
        "params": {}
      }
    ],
    "config": {}
  },
  "stats": {
    "started_at": "2024-10-08T20:05:48-07:00",
    "completed_at": "2024-10-08T20:05:51-07:00",
    "model_usage": {
      "anthropic/claude-3-5-sonnet-20240620": {
        "input_tokens": 62,
        "output_tokens": 130,
        "total_tokens": 192
      }
    }
  },
  "error": {
    "message": "RuntimeError('An error occurred')",
    "traceback": "Traceback (most recent call last):\n\n  File \"/path/to/.venv/lib/python3.11/site-packages/inspect_ai/_eval/task/run.py\", line 260, in task_run\n    sample_results = await asyncio.gather(*sample_coroutines)\n                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n  File \"/path/to/.venv/lib/python3.11/site-packages/inspect_ai/_eval/task/run.py\", line 424, in task_run_sample\n    error = sample_error(ex)\n            ^^^^^^^^^^^^^^^^\n\n  File \"/path/to/.venv/lib/python3.11/site-packages/inspect_ai/_eval/task/error.py\", line 22, in __call__\n    raise ex\n\n  File \"/path/to/.venv/lib/python3.11/site-packages/inspect_ai/_eval/task/run.py\", line 416, in task_run_sample\n    state = await plan(state, generate)\n            ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n  File \"/path/to/.venv/lib/python3.11/site-packages/inspect_ai/solver/_plan.py\", line 105, in __call__\n    state = await solver(state, generate)\n            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n  File \"/path/to/test.py\", line 21, in error\n    raise RuntimeError(\"An error occurred\")\n\nRuntimeError: An error occurred\n",
    "traceback_ansi": "\u001b[31m┌─\u001b[0m\u001b[31m────────────────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m────────────────────────────────────────\u001b[0m\u001b[31m─┐\u001b[0m\n\u001b[31m│\u001b[0m \u001b[2;33m/path/to/.venv/lib/python3.11/site-packages/inspect_ai/_eval/task/\u001b[0m\u001b[1;33mrun.py\u001b[0m:\u001b[94m260\u001b[0m in   \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[92mtask_run\u001b[0m                                                                                                            \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m                                                                                                                     \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[2;33m/path/to/.venv/lib/python3.11/site-packages/inspect_ai/_eval/task/\u001b[0m\u001b[1;33mrun.py\u001b[0m:\u001b[94m424\u001b[0m in   \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[92mtask_run_sample\u001b[0m                                                                                                     \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m                                                                                                                     \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[2;33m/path/to/.venv/lib/python3.11/site-packages/inspect_ai/_eval/task/\u001b[0m\u001b[1;33merror.py\u001b[0m:\u001b[94m22\u001b[0m in  \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[92m__call__\u001b[0m                                                                                                            \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m                                                                                                                     \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[2;33m/path/to/.venv/lib/python3.11/site-packages/inspect_ai/_eval/task/\u001b[0m\u001b[1;33mrun.py\u001b[0m:\u001b[94m416\u001b[0m in   \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[92mtask_run_sample\u001b[0m                                                                                                     \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m                                                                                                                     \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[2;33m/path/to/.venv/lib/python3.11/site-packages/inspect_ai/solver/\u001b[0m\u001b[1;33m_plan.py\u001b[0m:\u001b[94m105\u001b[0m in     \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[92m__call__\u001b[0m                                                                                                            \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m                                                                                                                     \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[2;33m/path/to/\u001b[0m\u001b[1;33mtest.py\u001b[0m:\u001b[94m21\u001b[0m in \u001b[92merror\u001b[0m                                                      \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m                                                                                                                     \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m   \u001b[2m18 \u001b[0m\u001b[1;95m@solver\u001b[0m                                                                                                        \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m   \u001b[2m19 \u001b[0m\u001b[94mdef\u001b[0m 
\u001b[92merror\u001b[0m() -> Solver:                                                                                         \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m   \u001b[2m20 \u001b[0m\u001b[2m│   \u001b[0m\u001b[94masync\u001b[0m \u001b[94mdef\u001b[0m \u001b[92merror\u001b[0m(state: TaskState, generate: Generate) -> TaskState:                                        \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m \u001b[31m> \u001b[0m21 \u001b[2m│   │   \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33m\"\u001b[0m\u001b[33mAn error occurred\u001b[0m\u001b[33m\"\u001b[0m)                                                                \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m   \u001b[2m22 \u001b[0m\u001b[2m│   \u001b[0m                                                                                                           \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m   \u001b[2m23 \u001b[0m\u001b[2m│   \u001b[0m\u001b[2m# return solve\u001b[0m                                                                                             \u001b[31m│\u001b[0m\n\u001b[31m│\u001b[0m   \u001b[2m24 \u001b[0m\u001b[2m│   \u001b[0m\u001b[94mreturn\u001b[0m error                                                                                               \u001b[31m│\u001b[0m\n\u001b[31m└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\u001b[0m\n\u001b[1;91mRuntimeError: \u001b[0mAn error occurred\n"
  }
}
@jjallaire-aisi (Collaborator) commented

Agreed! We are throwing away valuable information here. We will remedy this soon.

In the meantime, there is a --fail-on-error option you can use to have errors result in failed samples rather than failed evals (you can apply this universally, or only once some number or percentage of samples have errored: https://inspect.ai-safety-institute.org.uk/errors-and-limits.html#failure-threshold).
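For example, per the linked docs a proportional threshold like the one below should treat sample errors as failed samples and only fail the eval if more than 10% of samples error (the 0.1 value here is illustrative):

inspect eval test.py --model anthropic/claude-3-5-sonnet-20240620 --fail-on-error 0.1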
