dvc.lock

schema: '2.0'
stages:
  fetch_swe_bench_runs:
    cmd: python -m src.fetch_swe_bench_runs --annotations data/external/ensembled_annotations_public.csv
      --output-file data/external/swe_bench_runs.jsonl
    deps:
    - path: src/fetch_swe_bench_runs.py
      hash: md5
      md5: 78bcec8f686d817e50b4533b061fa346
      size: 9120
    outs:
    - path: data/external/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
  filter_out_partial_agents:
    cmd: python -m src.filter_out_partial_agents --input-all-runs data/external/all_runs.jsonl
      --output-runs-with-allowed-agents data/processed/runs/ga_agents.jsonl
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: src/filter_out_partial_agents.py
      hash: md5
      md5: 76e5c8cece6049529b643290e25ecddb
      size: 1106
    outs:
    - path: data/processed/runs/ga_agents.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
  filter_aird_runs:
    cmd: python -m src.filter_aird_runs --input-file data/external/all_runs.jsonl
      --output-file data/processed/runs/aird.jsonl
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: src/filter_aird_runs.py
      hash: md5
      md5: f4ae81387bdd0a2d0f0a0ce005f7b49b
      size: 950
    params:
      params.yaml:
        stages.filter_aird_runs:
          task_families:
          - ai_rd_fix_embedding
          - ai_rd_nanogpt_chat_rl
          - ai_rd_optimize_llm_foundry
          - ai_rd_restricted_mlm
          - ai_rd_rust_codecontests_inference
          - ai_rd_small_scaling_law
          - ai_rd_triton_cumsum
    outs:
    - path: data/processed/runs/aird.jsonl
      hash: md5
      md5: bd9e9ad6231d8e42391add73dc48b470
      size: 419167
  wrangle_bar_by_time_allocation:
    cmd: python -m src.wrangle.bar_by_time_allocation --runs-file data/processed/runs/aird.jsonl
      --wrangled-file data/processed/wrangled/bar_by_time_allocation.jsonl
    deps:
    - path: data/processed/runs/aird.jsonl
      hash: md5
      md5: bd9e9ad6231d8e42391add73dc48b470
      size: 419167
    - path: src/stats/statistics.py
      hash: md5
      md5: 847fb366793e66de803cd15dcc98979f
      size: 5832
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    - path: src/wrangle/bar_by_time_allocation.py
      hash: md5
      md5: 1d6abf2b30046611a9c6fbe2aabeb36b
      size: 3603
    params:
      params.yaml:
        n_bootstrap: 1000
        rebench_best_of_k_parameters:
          time_limits:
          - 36000
          max_time_limit_in_seconds: 36000
    outs:
    - path: data/processed/wrangled/bar_by_time_allocation.jsonl
      hash: md5
      md5: d65a6a890c0f3f1bd1a2f1f7ba8933fa
      size: 2101
  wrangle_human_mean_of_percentiles:
    cmd: python -m src.wrangle.human_mean_of_percentiles --interpolated-scores data/processed/runs/aird.jsonl
      --output-percentiles data/processed/wrangled/human_mean_of_percentiles.jsonl
      --log-level INFO
    deps:
    - path: data/processed/runs/aird.jsonl
      hash: md5
      md5: bd9e9ad6231d8e42391add73dc48b470
      size: 419167
    - path: src/wrangle/human_mean_of_percentiles.py
      hash: md5
      md5: d8ba4e2b646d4c16c756309eb17135ec
      size: 2958
    params:
      params.yaml:
        log_level: INFO
    outs:
    - path: data/processed/wrangled/human_mean_of_percentiles.jsonl
      hash: md5
      md5: fdb0c471fa77300828c324ddcf925610
      size: 11651
  wrangle_score_at_k:
    cmd: python -m src.wrangle.score_at_k --input-score-at-k data/processed/runs/aird.jsonl
      --output-score-at-k data/processed/wrangled/score_at_k.jsonl --n-bootstrap 1000
    deps:
    - path: data/processed/runs/aird.jsonl
      hash: md5
      md5: bd9e9ad6231d8e42391add73dc48b470
      size: 419167
    - path: src/wrangle/score_at_k.py
      hash: md5
      md5: 207acf4e38240cb8d27b5424a230cf4a
      size: 5320
    params:
      params.yaml:
        log_level: INFO
        n_bootstrap: 1000
        rebench_best_of_k_parameters:
          time_limits:
          - 36000
          max_time_limit_in_seconds: 36000
    outs:
    - path: data/processed/wrangled/score_at_k.jsonl
      hash: md5
      md5: a90888af6db4b02c5d236c100ebe1ce4
      size: 5928
  wrangle_bootstrap_logistic@equal_task_weight-ftr-0.01:
    cmd: python -m src.wrangle.bootstrap --runs-file data/external/all_runs.jsonl
      --output-bootstrap-horizons-file data/wrangled/bootstrap/equal_task_weight-ftr-0.01.csv
      --weights-col equal_task_weight --categories ftr --n-bootstrap 500 --regularization
      0.01
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: edcfd3c10cbba6e44ad4442535d93104
      size: 10441
    - path: src/wrangle/logistic.py
      hash: md5
      md5: dd5b095413f86de74d055f81921b595c
      size: 6845
    outs:
    - path: data/wrangled/bootstrap/equal_task_weight-ftr-0.01.csv
      hash: md5
      md5: ed5001468484e4874ac5dbc4fa9690e0
      size: 130705
  wrangle_logistic_regression@equal_task_weight-ftr-0.01:
    cmd: python -m src.wrangle.logistic --runs-file data/external/all_runs.jsonl --output-logistic-fits-file
      data/wrangled/logistic_regression_equal_task_weight_0.01_ftr.csv --weighting
      equal_task_weight --regularization 0.01 --bootstrap-file data/wrangled/bootstrap/equal_task_weight-ftr-0.01.csv
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/wrangled/bootstrap/equal_task_weight-ftr-0.01.csv
      hash: md5
      md5: ed5001468484e4874ac5dbc4fa9690e0
      size: 130705
    - path: src/wrangle/logistic.py
      hash: md5
      md5: dd5b095413f86de74d055f81921b595c
      size: 6845
    outs:
    - path: data/wrangled/logistic_regression_equal_task_weight_0.01_ftr.csv
      hash: md5
      md5: 9504be42e6392018a046eaff22f29bf9
      size: 3230
  wrangle_bootstrap_logistic@equal_task_weight-ftr-0.1:
    cmd: python -m src.wrangle.bootstrap --runs-file data/external/all_runs.jsonl
      --output-bootstrap-horizons-file data/wrangled/bootstrap/equal_task_weight-ftr-0.1.csv
      --weights-col equal_task_weight --categories ftr --n-bootstrap 500 --regularization
      0.1
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: edcfd3c10cbba6e44ad4442535d93104
      size: 10441
    - path: src/wrangle/logistic.py
      hash: md5
      md5: dd5b095413f86de74d055f81921b595c
      size: 6845
    outs:
    - path: data/wrangled/bootstrap/equal_task_weight-ftr-0.1.csv
      hash: md5
      md5: 5cca15de7772c76766604a7e7b82208c
      size: 130829
  wrangle_logistic_regression@equal_task_weight-ftr-0.1:
    cmd: python -m src.wrangle.logistic --runs-file data/external/all_runs.jsonl --output-logistic-fits-file
      data/wrangled/logistic_regression_equal_task_weight_0.1_ftr.csv --weighting
      equal_task_weight --regularization 0.1 --bootstrap-file data/wrangled/bootstrap/equal_task_weight-ftr-0.1.csv
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/wrangled/bootstrap/equal_task_weight-ftr-0.1.csv
      hash: md5
      md5: 5cca15de7772c76766604a7e7b82208c
      size: 130829
    - path: src/wrangle/logistic.py
      hash: md5
      md5: dd5b095413f86de74d055f81921b595c
      size: 6845
    outs:
    - path: data/wrangled/logistic_regression_equal_task_weight_0.1_ftr.csv
      hash: md5
      md5: 12984572cecbb05eca60763f1b8c473a
      size: 3238
  wrangle_bootstrap_logistic@invsqrt_task_weight-ftr-0.01:
    cmd: python -m src.wrangle.bootstrap --runs-file data/external/all_runs.jsonl
      --output-bootstrap-horizons-file data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.01.csv
      --weights-col invsqrt_task_weight --categories ftr --n-bootstrap 500 --regularization
      0.01
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: edcfd3c10cbba6e44ad4442535d93104
      size: 10441
    - path: src/wrangle/logistic.py
      hash: md5
      md5: dd5b095413f86de74d055f81921b595c
      size: 6845
    outs:
    - path: data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.01.csv
      hash: md5
      md5: ab8f528eb621ec3ad61aceab364ca292
      size: 130642
  wrangle_logistic_regression@invsqrt_task_weight-ftr-0.01:
    cmd: python -m src.wrangle.logistic --runs-file data/external/all_runs.jsonl --output-logistic-fits-file
      data/wrangled/logistic_regression_invsqrt_task_weight_0.01_ftr.csv --weighting
      invsqrt_task_weight --regularization 0.01 --bootstrap-file data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.01.csv
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.01.csv
      hash: md5
      md5: ab8f528eb621ec3ad61aceab364ca292
      size: 130642
    - path: src/wrangle/logistic.py
      hash: md5
      md5: dd5b095413f86de74d055f81921b595c
      size: 6845
    outs:
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.01_ftr.csv
      hash: md5
      md5: 3527e6fc49ca4bf98900293088e4ebf6
      size: 3237
  wrangle_bootstrap_logistic@invsqrt_task_weight-ftr-0.1:
    cmd: python -m src.wrangle.bootstrap --runs-file data/external/all_runs.jsonl
      --output-bootstrap-horizons-file data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.1.csv
      --weights-col invsqrt_task_weight --categories ftr --n-bootstrap 500 --regularization
      0.1
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: edcfd3c10cbba6e44ad4442535d93104
      size: 10441
    - path: src/wrangle/logistic.py
      hash: md5
      md5: dd5b095413f86de74d055f81921b595c
      size: 6845
    outs:
    - path: data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.1.csv
      hash: md5
      md5: 5b0bbb5b487a3b75578289f6ad5418e3
      size: 130847
  wrangle_logistic_regression@invsqrt_task_weight-ftr-0.1:
    cmd: python -m src.wrangle.logistic --runs-file data/external/all_runs.jsonl --output-logistic-fits-file
      data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv --weighting
      invsqrt_task_weight --regularization 0.1 --bootstrap-file data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.1.csv
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.1.csv
      hash: md5
      md5: 5b0bbb5b487a3b75578289f6ad5418e3
      size: 130847
    - path: src/wrangle/logistic.py
      hash: md5
      md5: dd5b095413f86de74d055f81921b595c
      size: 6845
    outs:
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
  wrangle_bootstrap_logistic_swe_bench@invsqrt_task_weight-ftr-0.1:
    cmd: python -m src.wrangle.bootstrap --runs-file data/processed/swe_bench_runs.jsonl
      --output-bootstrap-horizons-file data/wrangled/bootstrap/swe_bench_runs_bootstrap.csv
      --weights-col invsqrt_task_weight --categories ftr --n-bootstrap 500 --regularization
      0.1
    deps:
    - path: data/processed/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: edcfd3c10cbba6e44ad4442535d93104
      size: 10441
    - path: src/wrangle/logistic.py
      hash: md5
      md5: dd5b095413f86de74d055f81921b595c
      size: 6845
    outs:
    - path: data/wrangled/bootstrap/swe_bench_runs_bootstrap.csv
      hash: md5
      md5: 3dbf1b003dbe1cab98fd2d9e8e62cd3e
      size: 55577
  wrangle_logistic_regression_swe_bench@invsqrt_task_weight-0.1:
    cmd: python -m src.wrangle.logistic --runs-file data/processed/swe_bench_runs.jsonl
      --output-logistic-fits-file data/wrangled/swe_bench_logistic.csv --weighting
      invsqrt_task_weight --regularization 0.1 --bootstrap-file data/wrangled/bootstrap/swe_bench_runs_bootstrap.csv
    deps:
    - path: data/processed/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
    - path: data/wrangled/bootstrap/swe_bench_runs_bootstrap.csv
      hash: md5
      md5: 3dbf1b003dbe1cab98fd2d9e8e62cd3e
      size: 55577
    - path: src/wrangle/logistic.py
      hash: md5
      md5: dd5b095413f86de74d055f81921b595c
      size: 6845
    outs:
    - path: data/wrangled/swe_bench_logistic.csv
      hash: md5
      md5: c3bdc96a6dc296d5380c118cc17f26e4
      size: 1402
  plot_bar_chart@invsqrt_task_weight-None-None-0.1:
    cmd: python -m src.plot.bar_chart --metrics-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --output-file plots/bar_chart/invsqrt_task_weight.png --log-level INFO --weighting
      invsqrt_task_weight --boot-set None --pass-at-k-sampling None --params params.yaml:plots
    deps:
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
    - path: src/plot/bar_chart.py
      hash: md5
      md5: 629e8e84d6e0a50cd093e43cb6f26b26
      size: 6814
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
    outs:
    - path: plots/bar_chart/invsqrt_task_weight.png
      hash: md5
      md5: 1eb5cb9a2754b0860db9b591090c15d3
      size: 23478
  plot_bar_by_time_allocation:
    cmd: python -m src.plot.bar_by_time_allocation --input data/processed/wrangled/bar_by_time_allocation.jsonl
      --output plots/bar_by_time_allocation.png --log-level INFO
    deps:
    - path: data/processed/wrangled/bar_by_time_allocation.jsonl
      hash: md5
      md5: d65a6a890c0f3f1bd1a2f1f7ba8933fa
      size: 2101
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/bar_by_time_allocation.py
      hash: md5
      md5: 5df7d6c33dd0bb200e487233a6b8b57f
      size: 5661
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: plots/bar_by_time_allocation.png
      hash: md5
      md5: 81843283e3e3e7e69d1025b0d7c7f5cf
      size: 83731
  plot_score_at_k@36000:
    cmd: python -m src.plot.score_at_k --input-score-at-k data/processed/wrangled/score_at_k.jsonl
      --input-human-mean-of-percentiles data/processed/wrangled/human_mean_of_percentiles.jsonl
      --output-prefix plots/aird/score_at_k --time-limit 36000 --log-level INFO
    deps:
    - path: data/processed/wrangled/human_mean_of_percentiles.jsonl
      hash: md5
      md5: fdb0c471fa77300828c324ddcf925610
      size: 11651
    - path: data/processed/wrangled/score_at_k.jsonl
      hash: md5
      md5: a90888af6db4b02c5d236c100ebe1ce4
      size: 5928
    - path: src/plot/score_at_k.py
      hash: md5
      md5: a35128ac40e35de4363a51881aea1245
      size: 7003
    - path: src/stats/statistics.py
      hash: md5
      md5: 847fb366793e66de803cd15dcc98979f
      size: 5832
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
        rebench_best_of_k_parameters:
          time_limits:
          - 36000
          max_time_limit_in_seconds: 36000
    outs:
    - path: plots/aird/score_at_k_36000.png
      hash: md5
      md5: eb9b2067e716d9c6ba79c4115ccf783f
      size: 150506
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-true-full-2024-01-01:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-true-ftr-2024-01-01-distr_full.png
      --log-level INFO --trendlines true --after-date 2024-01-01 --weighting "invsqrt_task_weight"
      --include-task-distribution full
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-true-ftr-2024-01-01-distr_full.png
      hash: md5
      md5: 1425dfc31c18c48777663903597a9d3a
      size: 106074
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-true-full-2023-03-13:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-true-ftr-2023-03-13-distr_full.png
      --log-level INFO --trendlines true --after-date 2023-03-13 --weighting "invsqrt_task_weight"
      --include-task-distribution full
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 14a6f449504ad678b4be46430ac64461
      size: 6196702
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: 4cc83cf3e16977b197479ff812176923
      size: 3133
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: 97f205c6c3976ed03f82ba854c886232
      size: 14366
    - path: src/utils/plots.py
      hash: md5
      md5: 2281a64c4bcb5d3207ce05911784d91c
      size: 3367
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-true-ftr-2023-03-13-distr_full.png
      hash: md5
      md5: 2c8ba3ed41eb9f2fd717cce86b1b0a31
      size: 136256
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-true-none-2024-01-01:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-true-ftr-2024-01-01-distr_none.png
      --log-level INFO --trendlines true --after-date 2024-01-01 --weighting "invsqrt_task_weight"
      --include-task-distribution none
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-true-ftr-2024-01-01-distr_none.png
      hash: md5
      md5: fa45d964b0171f41b9d76412983942b6
      size: 83145
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-true-none-2023-03-13:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-true-ftr-2023-03-13-distr_none.png
      --log-level INFO --trendlines true --after-date 2023-03-13 --weighting "invsqrt_task_weight"
      --include-task-distribution none
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 14a6f449504ad678b4be46430ac64461
      size: 6196702
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: 4cc83cf3e16977b197479ff812176923
      size: 3133
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: 97f205c6c3976ed03f82ba854c886232
      size: 14366
    - path: src/utils/plots.py
      hash: md5
      md5: 2281a64c4bcb5d3207ce05911784d91c
      size: 3367
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-true-ftr-2023-03-13-distr_none.png
      hash: md5
      md5: f71bd1da47250201176aa884065a4ed7
      size: 112955
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-false-full-2024-01-01:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-false-ftr-2024-01-01-distr_full.png
      --log-level INFO --trendlines false --after-date 2024-01-01 --weighting "invsqrt_task_weight"
      --include-task-distribution full
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-false-ftr-2024-01-01-distr_full.png
      hash: md5
      md5: d80e63eec2989cd415fe1dbe4718ae04
      size: 118193
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-false-full-2023-03-13:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-false-ftr-2023-03-13-distr_full.png
      --log-level INFO --trendlines false --after-date 2023-03-13 --weighting "invsqrt_task_weight"
      --include-task-distribution full
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 14a6f449504ad678b4be46430ac64461
      size: 6196702
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: 4cc83cf3e16977b197479ff812176923
      size: 3133
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: 97f205c6c3976ed03f82ba854c886232
      size: 14366
    - path: src/utils/plots.py
      hash: md5
      md5: 2281a64c4bcb5d3207ce05911784d91c
      size: 3367
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-false-ftr-2023-03-13-distr_full.png
      hash: md5
      md5: 1b4e65b86211f5a6a12d69e8e3bd86cc
      size: 127058
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-false-none-2024-01-01:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-false-ftr-2024-01-01-distr_none.png
      --log-level INFO --trendlines false --after-date 2024-01-01 --weighting "invsqrt_task_weight"
      --include-task-distribution none
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-false-ftr-2024-01-01-distr_none.png
      hash: md5
      md5: ab1e0a67420c17475247637a08a22e5a
      size: 95415
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-false-none-2023-03-13:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-false-ftr-2023-03-13-distr_none.png
      --log-level INFO --trendlines false --after-date 2023-03-13 --weighting "invsqrt_task_weight"
      --include-task-distribution none
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 14a6f449504ad678b4be46430ac64461
      size: 6196702
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: 4cc83cf3e16977b197479ff812176923
      size: 3133
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: 97f205c6c3976ed03f82ba854c886232
      size: 14366
    - path: src/utils/plots.py
      hash: md5
      md5: 2281a64c4bcb5d3207ce05911784d91c
      size: 3367
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-false-ftr-2023-03-13-distr_none.png
      hash: md5
      md5: 5a5287de9fd6750d3d287f900472b520
      size: 101249
  plot_logistic_regression_swe_bench@invsqrt_task_weight-ftr-0.1-true-2024-01-01:
    cmd: python -m src.plot.logistic --input-file data/wrangled/swe_bench_logistic.csv
      --runs-file data/processed/swe_bench_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/swe_bench/swe_bench_invsqrt_task_weight-0.1-true_2024-01-01.png
      --log-level INFO --trendlines true --after-date 2024-01-01 --weighting "invsqrt_task_weight"
      --include-task-distribution=false
    deps:
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/processed/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
    - path: data/wrangled/swe_bench_logistic.csv
      hash: md5
      md5: c3bdc96a6dc296d5380c118cc17f26e4
      size: 1402
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/swe_bench/swe_bench_invsqrt_task_weight-0.1-true_2024-01-01.png
      hash: md5
      md5: 617328556b767a35360bc7e38c93a562
      size: 110456
  plot_logistic_regression_swe_bench@invsqrt_task_weight-ftr-0.1-true-2023-03-13:
    cmd: python -m src.plot.logistic --input-file data/wrangled/swe_bench_logistic.csv
      --runs-file data/processed/swe_bench_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/swe_bench/swe_bench_invsqrt_task_weight-0.1-true_2023-03-13.png
      --log-level INFO --trendlines true --after-date 2023-03-13 --weighting "invsqrt_task_weight"
      --include-task-distribution=false
    deps:
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/processed/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
    - path: data/wrangled/swe_bench_logistic.csv
      hash: md5
      md5: c3bdc96a6dc296d5380c118cc17f26e4
      size: 1402
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/swe_bench/swe_bench_invsqrt_task_weight-0.1-true_2023-03-13.png
      hash: md5
      md5: 4b294d4be01510d700b45b60a0a0c2e4
      size: 130103
  plot_logistic_regression_swe_bench@invsqrt_task_weight-ftr-0.1-false-2024-01-01:
    cmd: python -m src.plot.logistic --input-file data/wrangled/swe_bench_logistic.csv
      --runs-file data/processed/swe_bench_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/swe_bench/swe_bench_invsqrt_task_weight-0.1-false_2024-01-01.png
      --log-level INFO --trendlines false --after-date 2024-01-01 --weighting "invsqrt_task_weight"
      --include-task-distribution=false
    deps:
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/processed/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
    - path: data/wrangled/swe_bench_logistic.csv
      hash: md5
      md5: c3bdc96a6dc296d5380c118cc17f26e4
      size: 1402
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/swe_bench/swe_bench_invsqrt_task_weight-0.1-false_2024-01-01.png
      hash: md5
      md5: 6fecaf9d2cc6c16922260884a0d62f01
      size: 113299
  plot_logistic_regression_swe_bench@invsqrt_task_weight-ftr-0.1-false-2023-03-13:
    cmd: python -m src.plot.logistic --input-file data/wrangled/swe_bench_logistic.csv
      --runs-file data/processed/swe_bench_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/swe_bench/swe_bench_invsqrt_task_weight-0.1-false_2023-03-13.png
      --log-level INFO --trendlines false --after-date 2023-03-13 --weighting "invsqrt_task_weight"
      --include-task-distribution=false
    deps:
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/processed/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
    - path: data/wrangled/swe_bench_logistic.csv
      hash: md5
      md5: c3bdc96a6dc296d5380c118cc17f26e4
      size: 1402
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/swe_bench/swe_bench_invsqrt_task_weight-0.1-false_2023-03-13.png
      hash: md5
      md5: 475463306943adbfd602532c6e9b85ee
      size: 112237
  plot_logistic_multiverse:
    cmd: python -m src.plot.logistic_multiverse --runs-file data/external/all_runs.jsonl
      --release-dates data/external/release_dates.yaml --temp-dir data/temp/logistic_multiverse
      --output-file plots/logistic/logistic_multiverse.png --log-level INFO --output-metrics-file
      metrics/horizon_trend/multiverse.yaml
    deps:
    - path: data/external/release_dates.yaml
      hash: md5
      md5: db301efa7b0ee4e1bc2f424efacc8e1c
      size: 570
    - path: data/wrangled/
      hash: md5
      md5: 9b3da2b8c879018c0b257ec9294a7621.dir
      size: 506269
      nfiles: 13
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: bd4dc142d9d464770e93c429f37a33f9
      size: 23402
    - path: src/plot/logistic_multiverse.py
      hash: md5
      md5: 547117c511864ddb5e40ff3e9fbb4bfc
      size: 6529
    params:
      params.yaml:
        figs.plot_logistic_multiverse:
          exclude_agents:
          - GPT-4 0125
          weightings:
          - equal_task_weight
          - invsqrt_task_weight
          regularizations:
          - 0.1
          - 0.05
          - 0.02
          - 0.01
          categories: ftr
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: metrics/horizon_trend/multiverse.yaml
      hash: md5
      md5: 6186383db801067506358c436b3ca95e
      size: 429
    - path: plots/logistic/logistic_multiverse.png
      hash: md5
      md5: e9a6c009ab9b4ab338bb73c1fc49e03f
      size: 180572
  plot_logistic_individual@invsqrt_task_weight-0.01:
    cmd: python -m src.plot.logistic_individual --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.01_ftr.csv
      --output-file plots/logistic_individual/invsqrt_task_weight-0.01.png --log-level
      INFO
    deps:
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.01_ftr.csv
      hash: md5
      md5: 3527e6fc49ca4bf98900293088e4ebf6
      size: 3237
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic_individual.py
      hash: md5
      md5: d25042f331714c8024ddde975f26abfb
      size: 4103
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
    outs:
    - path: plots/logistic_individual/invsqrt_task_weight-0.01.png
      hash: md5
      md5: 656c482723500cbee163b3cd7dbc05f3
      size: 154588
  plot_logistic_individual@invsqrt_task_weight-0.1:
    cmd: python -m src.plot.logistic_individual --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --output-file plots/logistic_individual/invsqrt_task_weight-0.1.png --log-level
      INFO
    deps:
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic_individual.py
      hash: md5
      md5: d25042f331714c8024ddde975f26abfb
      size: 4103
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
    outs:
    - path: plots/logistic_individual/invsqrt_task_weight-0.1.png
      hash: md5
      md5: a7b65ba138976801b5c5671c85667a85
      size: 154739
  plot_logistic_individual_swe_bench@invsqrt_task_weight-0.1:
    cmd: python -m src.plot.logistic_individual --input-file data/wrangled/swe_bench_logistic.csv
      --output-file plots/logistic_individual/swe_bench/swe_bench_invsqrt_task_weight-0.1.png
      --log-level INFO
    deps:
    - path: data/wrangled/swe_bench_logistic.csv
      hash: md5
      md5: c3bdc96a6dc296d5380c118cc17f26e4
      size: 1402
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic_individual.py
      hash: md5
      md5: d25042f331714c8024ddde975f26abfb
      size: 4103
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
    outs:
    - path: plots/logistic_individual/swe_bench/swe_bench_invsqrt_task_weight-0.1.png
      hash: md5
      md5: c8ae30a1a2f9e0d8c125cb5e7cff7d4e
      size: 107479
  plot_bootstrap_ci@invsqrt_task_weight-ftr-2023-03-13-0.1:
    cmd: python -m src.plot.bootstrap_ci --input-file data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.1.csv
      --release-dates data/external/release_dates.yaml --output-file plots/bootstrap/invsqrt_task_weight-ftr-2023-03-13-0.1.png
      --weighting invsqrt_task_weight --categories ftr --regularization 0.1 --log-level
      INFO --n-samples 60 --after-date 2023-03-13
    deps:
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.1.csv
      hash: md5
      md5: 5b0bbb5b487a3b75578289f6ad5418e3
      size: 130847
    - path: src/plot/bootstrap_ci.py
      hash: md5
      md5: 45a52f60707d72c4b6a89f9fd9916ac5
      size: 7947
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
    outs:
    - path: plots/bootstrap/invsqrt_task_weight-ftr-2023-03-13-0.1.png
      hash: md5
      md5: a7ed845c11fa18d72595dc8e320a875f
      size: 250832
  plot_bootstrap_ci@invsqrt_task_weight-ftr-2024-01-01-0.1:
    cmd: python -m src.plot.bootstrap_ci --input-file data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.1.csv
      --release-dates data/external/release_dates.yaml --output-file plots/bootstrap/invsqrt_task_weight-ftr-2024-01-01-0.1.png
      --weighting invsqrt_task_weight --categories ftr --regularization 0.1 --log-level
      INFO --n-samples 60 --after-date 2024-01-01
    deps:
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/bootstrap/invsqrt_task_weight-ftr-0.1.csv
      hash: md5
      md5: 5b0bbb5b487a3b75578289f6ad5418e3
      size: 130847
    - path: src/plot/bootstrap_ci.py
      hash: md5
      md5: 45a52f60707d72c4b6a89f9fd9916ac5
      size: 7947
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
    outs:
    - path: plots/bootstrap/invsqrt_task_weight-ftr-2024-01-01-0.1.png
      hash: md5
      md5: 9cd82a71e44dfb0d4ebf073f4d78a7af
      size: 244631
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-true-full-2019-01-01:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-true-ftr-2019-01-01-distr_full.png
      --log-level INFO --trendlines true --after-date 2019-01-01 --weighting "invsqrt_task_weight"
      --include-task-distribution full
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-true-ftr-2019-01-01-distr_full.png
      hash: md5
      md5: c43c9bb619b026e73bd7750f8486b56c
      size: 137608
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-true-none-2019-01-01:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-true-ftr-2019-01-01-distr_none.png
      --log-level INFO --trendlines true --after-date 2019-01-01 --weighting "invsqrt_task_weight"
      --include-task-distribution none
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-true-ftr-2019-01-01-distr_none.png
      hash: md5
      md5: 7e0cfb80ea79f5207455295c8adbea22
      size: 115064
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-false-full-2019-01-01:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-false-ftr-2019-01-01-distr_full.png
      --log-level INFO --trendlines false --after-date 2019-01-01 --weighting "invsqrt_task_weight"
      --include-task-distribution full
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-false-ftr-2019-01-01-distr_full.png
      hash: md5
      md5: be60433d21a206df12c18a018c929ef6
      size: 142178
  plot_logistic_regression@invsqrt_task_weight-ftr-0.1-false-none-2019-01-01:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/invsqrt_task_weight-0.1-false-ftr-2019-01-01-distr_none.png
      --log-level INFO --trendlines false --after-date 2019-01-01 --weighting "invsqrt_task_weight"
      --include-task-distribution none
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 557c9500e054e832ecd77cd274d94f5d
      size: 6194396
    - path: data/external/release_dates.yaml
      hash: md5
      md5: caa7031aaaa86f4ef7298052b3b5a486
      size: 511
    - path: data/wrangled/logistic_regression_invsqrt_task_weight_0.1_ftr.csv
      hash: md5
      md5: baa4c86746abccf14ffe55df1acf773e
      size: 3238
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: b78668949286c250b467f035aa8d3f95
      size: 14149
    - path: src/utils/plots.py
      hash: md5
      md5: 6a051cd7d403e5664969c1ca2ab468ef
      size: 3403
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          colors:
            agent_aliases:
              Claude 3.5 Sonnet (New):
                light: '#B784ED'
                base: '#8B4DC9'
                dark: '#5F2B94'
              Claude 3.5 Sonnet (Old):
                light: '#D4B6F2'
                base: '#9B6BE0'
                dark: '#6B3DB0'
              Claude 3 Opus:
                light: '#E5D4F7'
                base: '#B594E8'
                dark: '#7A5BA6'
              GPT-4o:
                light: '#45B3D6'
                base: '#2B8FB0'
                dark: '#1A5668'
              GPT-4 Turbo:
                light: '#7CC3DB'
                base: '#4A9CBD'
                dark: '#2C6B8F'
              GPT-4 0314:
                light: '#ADD8E6'
                base: '#87CEEB'
                dark: '#4682B4'
              davinci-002:
                light: '#E0F3FF'
                base: '#B3E0FF'
                dark: '#80C4FF'
              gpt-3.5-turbo-instruct:
                light: '#F0F8FF'
                base: '#CCE6FF'
                dark: '#99CCFF'
              o1:
                light: '#90EE90'
                base: '#228B22'
                dark: '#006400'
              o1-preview:
                light: '#98FB98'
                base: '#3CB371'
                dark: '#2E8B57'
              human:
                light: '#c9c9c9'
                base: '#858585'
                dark: '#484848'
            default: '#000000'
          legend_order:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4o
          - davinci
          - gpt2
          - o1-preview
          - o1
          - Human 8-hour score
        weighting:
        - weight_col: equal_task_weight
          graph_snippet: Equally weighted tasks
        - weight_col: invsqrt_task_weight
          graph_snippet: Tasks diversity-weighted (1/sqrt(n))
        - weight_col:
          graph_snippet: None
    outs:
    - path: plots/logistic/invsqrt_task_weight-0.1-false-ftr-2019-01-01-distr_none.png
      hash: md5
      md5: 1f358e11818ae00cd22419575c751674
      size: 119206
  calculate_baseline_statistics:
    cmd: python -m src.calculate_baseline_statistics --input-file data/external/all_runs.jsonl
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: src/calculate_baseline_statistics.py
      hash: md5
      md5: 09e038a80f8bc883780c914246d979fa
      size: 4300
    outs:
    - path: metrics/baseline_statistics.yaml
      hash: md5
      md5: 238e5d4dbb07bb3d666b5fc5d79b7ca7
      size: 390
  wrangle_bootstrap_logistic@0:
    cmd: python -m src.wrangle.bootstrap --fig-name single_line_2023_ga_rebench --runs-file
      data/external/all_runs.jsonl --output-bootstrap-horizons-file data/wrangled/bootstrap/single_line_2023_ga_rebench.csv
      --n-bootstrap 500
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b075c912c1c344b15ba1424bc37c4a3a
      size: 9706149
    - path: src/utils/logistic.py
      hash: md5
      md5: 33bb43781a4954021361ec1e4931aac8
      size: 687
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: 5f9f118399ef508188bae970e1f2014c
      size: 10815
    - path: src/wrangle/logistic.py
      hash: md5
      md5: 983ed2e73e116c6dc8f1d465a810bad7
      size: 7053
    params:
      params.yaml:
        plots.plot_logistic_regression.single_line_2023_ga_rebench:
          logistic_file_string: single_line_2023_ga_rebench
          parameter_group_name: single_line_2023_ga_rebench
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2023-01-01'
            color: blue
            line_start_date: '2023-01-01'
            line_end_date: '2025-04-01'
            display_r_squared: true
            data_file:
            styling:
              linewidth: 2
              alpha: 0.5
              linestyle: dashed
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2022-12-01'
          x_lim_end: '2025-04-01'
          lower_y_lim: 0.5
          upper_y_lim: 480
          exclude:
          - SWAA
          exclude_agents:
          - GPT-4 0125
    outs:
    - path: data/wrangled/bootstrap/single_line_2023_ga_rebench.csv
      hash: md5
      md5: 0fb40703aa1a158ed507328198de5298
      size: 111483
  wrangle_bootstrap_logistic@1:
    cmd: python -m src.wrangle.bootstrap --fig-name double_line_all_data --runs-file
      data/external/all_runs.jsonl --output-bootstrap-horizons-file data/wrangled/bootstrap/double_line_all_data.csv
      --n-bootstrap 500
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b075c912c1c344b15ba1424bc37c4a3a
      size: 9706149
    - path: src/utils/logistic.py
      hash: md5
      md5: 33bb43781a4954021361ec1e4931aac8
      size: 687
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: 5f9f118399ef508188bae970e1f2014c
      size: 10815
    - path: src/wrangle/logistic.py
      hash: md5
      md5: 983ed2e73e116c6dc8f1d465a810bad7
      size: 7053
    params:
      params.yaml:
        plots.plot_logistic_regression.double_line_all_data:
          logistic_file_string: double_line_all_data
          parameter_group_name: double_line_all_data
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2019-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            data_file:
            styling:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2024-01-01'
            color: red
            line_start_date: '2023-08-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            data_file:
            styling:
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2018-09-03'
          x_lim_end: '2025-11-06'
          exclude: []
          lower_y_lim: 0.0083333
          upper_y_lim: 480
          exclude_agents:
          - GPT-4 0125
    outs:
    - path: data/wrangled/bootstrap/double_line_all_data.csv
      hash: md5
      md5: b3e1eff8b33a8f34cd184c207d1a15da
      size: 130108
  wrangle_bootstrap_logistic@2:
    cmd: python -m src.wrangle.bootstrap --fig-name headline --runs-file data/external/all_runs.jsonl
      --output-bootstrap-horizons-file data/wrangled/bootstrap/headline.csv --n-bootstrap
      500
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b075c912c1c344b15ba1424bc37c4a3a
      size: 9706149
    - path: src/utils/logistic.py
      hash: md5
      md5: 33bb43781a4954021361ec1e4931aac8
      size: 687
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: 5f9f118399ef508188bae970e1f2014c
      size: 10815
    - path: src/wrangle/logistic.py
      hash: md5
      md5: 983ed2e73e116c6dc8f1d465a810bad7
      size: 7053
    params:
      params.yaml:
        plots.plot_logistic_regression.headline:
          parameter_group_name: headline
          trendlines:
          - fit_type: exponential
            caption:
            after_date: '2019-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            styling:
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2018-09-03'
          x_lim_end: '2025-11-06'
          exclude: []
          lower_y_lim: 0.0083333
          upper_y_lim: 960
          exclude_agents:
          - GPT-4 0125
          - GPT-4 1106
    outs:
    - path: data/wrangled/bootstrap/headline.csv
      hash: md5
      md5: b3e1eff8b33a8f34cd184c207d1a15da
      size: 130108
  wrangle_bootstrap_logistic_swe_bench:
    cmd: python -m src.wrangle.bootstrap --fig-name swe_bench --runs-file data/processed/swe_bench_runs.jsonl
      --output-bootstrap-horizons-file data/wrangled/bootstrap/swe_bench_runs_bootstrap_swe_bench.csv
      --n-bootstrap 500
    deps:
    - path: data/processed/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
    - path: src/utils/logistic.py
      hash: md5
      md5: 33bb43781a4954021361ec1e4931aac8
      size: 687
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: 5f9f118399ef508188bae970e1f2014c
      size: 10815
    - path: src/wrangle/logistic.py
      hash: md5
      md5: 983ed2e73e116c6dc8f1d465a810bad7
      size: 7053
    params:
      params.yaml:
        plots.plot_logistic_regression.swe_bench:
          parameter_group_name: swe_bench
          weighting: invsqrt_task_weight
          regularization: 0.1
    outs:
    - path: data/wrangled/bootstrap/swe_bench_runs_bootstrap_swe_bench.csv
      hash: md5
      md5: 3dbf1b003dbe1cab98fd2d9e8e62cd3e
      size: 55577
  wrangle_logistic_regression@0:
    cmd: python -m src.wrangle.logistic --fig-name single_line_2023_ga_rebench --runs-file
      data/external/all_runs.jsonl --output-logistic-fits-file data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
      --release-dates data/external/release_dates.yaml --bootstrap-file data/wrangled/bootstrap/single_line_2023_ga_rebench.csv
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b075c912c1c344b15ba1424bc37c4a3a
      size: 9706149
    - path: data/external/release_dates.yaml
      hash: md5
      md5: 4a91639d8971362e21161cb104a6589e
      size: 561
    - path: data/wrangled/bootstrap/single_line_2023_ga_rebench.csv
      hash: md5
      md5: 0fb40703aa1a158ed507328198de5298
      size: 111483
    - path: src/wrangle/logistic.py
      hash: md5
      md5: 983ed2e73e116c6dc8f1d465a810bad7
      size: 7053
    params:
      params.yaml:
        plots.plot_logistic_regression.single_line_2023_ga_rebench:
          logistic_file_string: single_line_2023_ga_rebench
          parameter_group_name: single_line_2023_ga_rebench
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2023-01-01'
            color: blue
            line_start_date: '2023-01-01'
            line_end_date: '2025-04-01'
            display_r_squared: true
            data_file:
            styling:
              linewidth: 2
              alpha: 0.5
              linestyle: dashed
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2022-12-01'
          x_lim_end: '2025-04-01'
          lower_y_lim: 0.5
          upper_y_lim: 480
          exclude:
          - SWAA
          exclude_agents:
          - GPT-4 0125
    outs:
    - path: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
      hash: md5
      md5: 13c722bbac3d61bd8d81f3ba53e478e6
      size: 3249
  wrangle_logistic_regression@1:
    cmd: python -m src.wrangle.logistic --fig-name double_line_all_data --runs-file
      data/external/all_runs.jsonl --output-logistic-fits-file data/wrangled/logistic_fits_double_line_all_data.csv
      --release-dates data/external/release_dates.yaml --bootstrap-file data/wrangled/bootstrap/double_line_all_data.csv
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b075c912c1c344b15ba1424bc37c4a3a
      size: 9706149
    - path: data/external/release_dates.yaml
      hash: md5
      md5: 4a91639d8971362e21161cb104a6589e
      size: 561
    - path: data/wrangled/bootstrap/double_line_all_data.csv
      hash: md5
      md5: b3e1eff8b33a8f34cd184c207d1a15da
      size: 130108
    - path: src/wrangle/logistic.py
      hash: md5
      md5: 983ed2e73e116c6dc8f1d465a810bad7
      size: 7053
    params:
      params.yaml:
        plots.plot_logistic_regression.double_line_all_data:
          logistic_file_string: double_line_all_data
          parameter_group_name: double_line_all_data
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2019-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            data_file:
            styling:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2024-01-01'
            color: red
            line_start_date: '2023-08-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            data_file:
            styling:
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2018-09-03'
          x_lim_end: '2025-11-06'
          exclude: []
          lower_y_lim: 0.0083333
          upper_y_lim: 480
          exclude_agents:
          - GPT-4 0125
    outs:
    - path: data/wrangled/logistic_fits_double_line_all_data.csv
      hash: md5
      md5: 18cf9868a62cc0faec6b6772aec9004f
      size: 3461
  wrangle_logistic_regression@2:
    cmd: python -m src.wrangle.logistic --fig-name headline --runs-file data/external/all_runs.jsonl
      --output-logistic-fits-file data/wrangled/logistic_fits_headline.csv --release-dates
      data/external/release_dates.yaml --bootstrap-file data/wrangled/bootstrap/headline.csv
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b075c912c1c344b15ba1424bc37c4a3a
      size: 9706149
    - path: data/external/release_dates.yaml
      hash: md5
      md5: 4a91639d8971362e21161cb104a6589e
      size: 561
    - path: data/wrangled/bootstrap/headline.csv
      hash: md5
      md5: b3e1eff8b33a8f34cd184c207d1a15da
      size: 130108
    - path: src/wrangle/logistic.py
      hash: md5
      md5: 983ed2e73e116c6dc8f1d465a810bad7
      size: 7053
    params:
      params.yaml:
        plots.plot_logistic_regression.headline:
          parameter_group_name: headline
          trendlines:
          - fit_type: exponential
            caption:
            after_date: '2019-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            styling:
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2018-09-03'
          x_lim_end: '2025-11-06'
          exclude: []
          lower_y_lim: 0.0083333
          upper_y_lim: 960
          exclude_agents:
          - GPT-4 0125
          - GPT-4 1106
    outs:
    - path: data/wrangled/logistic_fits_headline.csv
      hash: md5
      md5: 18cf9868a62cc0faec6b6772aec9004f
      size: 3461
  wrangle_logistic_regression_swe_bench:
    cmd: python -m src.wrangle.logistic --fig-name swe_bench --runs-file data/processed/swe_bench_runs.jsonl
      --output-logistic-fits-file data/wrangled/swe_bench_logistic_fits_swe_bench.csv
      --release-dates data/external/release_dates.yaml --bootstrap-file data/wrangled/bootstrap/swe_bench_runs_bootstrap_swe_bench.csv
    deps:
    - path: data/processed/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
    - path: data/wrangled/bootstrap/swe_bench_runs_bootstrap_swe_bench.csv
      hash: md5
      md5: 3dbf1b003dbe1cab98fd2d9e8e62cd3e
      size: 55577
    - path: src/wrangle/logistic.py
      hash: md5
      md5: 983ed2e73e116c6dc8f1d465a810bad7
      size: 7053
    params:
      params.yaml:
        plots.plot_logistic_regression.swe_bench:
          parameter_group_name: swe_bench
          weighting: invsqrt_task_weight
          regularization: 0.1
    outs:
    - path: data/wrangled/swe_bench_logistic_fits_swe_bench.csv
      hash: md5
      md5: 1bfdc8a6cb1bbe19812896db5628797b
      size: 1484
  plot_bar_chart_weighted_scores@headline:
    cmd: python -m src.plot.bar_chart_weighted_scores --fig-name horizon_plots:headline
      --metrics-file data/wrangled/logistic_fits/headline.csv --release-dates data/external/release_dates.yaml
      --output-file plots/bar_chart_weighted_scores/headline.png --log-level INFO
    deps:
    - path: data/external/release_dates.yaml
      hash: md5
      md5: db301efa7b0ee4e1bc2f424efacc8e1c
      size: 570
    - path: data/wrangled/logistic_fits/headline.csv
      hash: md5
      md5: a93f56cf661066e31a0b5bfc7027b0a8
      size: 3461
    - path: src/plot/bar_chart_weighted_scores.py
      hash: md5
      md5: 827e6b2d7be69b38b6dca5d488539e66
      size: 5193
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
        stages.plot_bar_chart_weighted_scores:
          focus_agents:
          - Claude 3 Opus
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - GPT-4 Turbo
          - GPT-4 0314
          - GPT-4 1106
          - GPT-4o
          - davinci-002 (GPT-3)
          - GPT-2
          - gpt-3.5-turbo-instruct
          - o1-preview
          - o1
          weighting: invsqrt_task_weight
          exclude: []
    outs:
    - path: plots/bar_chart_weighted_scores/headline.png
      hash: md5
      md5: 63c901364681db156b407d9d30581740
      size: 119960
  plot_logistic_regression@single_line_2023_ga_rebench:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/single_line_2023_ga_rebench.png --log-level INFO
      --script-parameter-group single_line_2023_ga_rebench --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
      hash: md5
      md5: c30790c50fd5945bef72badaa3725f21
      size: 3258
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: bd4dc142d9d464770e93c429f37a33f9
      size: 23402
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        figs.plot_logistic_regression.single_line_2023_ga_rebench:
          runs_file: all_runs
          logistic_file: single_line_2023_ga_rebench
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2023-01-01'
            color: blue
            line_start_date: '2023-01-01'
            line_end_date: '2025-04-01'
            display_r_squared: true
            data_file:
            styling:
              linewidth: 2
              alpha: 0.5
              linestyle: dashed
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2022-12-01'
          x_lim_end: '2025-04-01'
          lower_y_lim: 0.5
          upper_y_lim: 120
          exclude:
          - SWAA
          exclude_agents:
          - GPT-4 0125
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: plots/logistic/single_line_2023_ga_rebench.png
      hash: md5
      md5: 169f955ab60bb2df93aedc8a094e5487
      size: 136398
  plot_logistic_regression@single_line_2023_ga_rebench_no_gpt_4_1106:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/single_line_2023_ga_rebench_no_gpt_4_1106.png --log-level
      INFO --script-parameter-group single_line_2023_ga_rebench_no_gpt_4_1106 --params-file
      "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 9568cffc08d138d09c73e634d820d3a5
      size: 7909159
    - path: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
      hash: md5
      md5: 13c722bbac3d61bd8d81f3ba53e478e6
      size: 3249
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: 3a32d389896d25952fac5226ebb4bd3b
      size: 23154
    - path: src/utils/plots.py
      hash: md5
      md5: bf8aec8c9607c4cd1c2ea4f316b3091f
      size: 5360
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002:
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            gpt2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002
          - gpt2
          - Human 8-hour score
          plot_logistic_regression:
            headline:
              parameter_group_name: headline
              trendlines:
              - fit_type: exponential
                caption:
                after_date: '2019-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 960
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench_no_gpt_4_1106:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench_no_gpt_4_1106
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_retrodict_excluding_swaa:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              include_task_distribution: none
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
            double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              include_task_distribution: none
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude:
              - SWAA
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 1106
              - GPT-4 0125
            double_line_all_data:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_no_gpt_4_1106:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_no_gpt_4_1106
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 1106
              - GPT-4 0125
            swe_bench:
              parameter_group_name: swe_bench
              weighting: invsqrt_task_weight
              regularization: 0.1
          plot_individual_histograms:
            no_swaa:
              annotate_p50: true
              parameter_group_name: no_swaa
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 1106
              - GPT-4 0314
            no_swaa_no_gpt_4_1106:
              annotate_p50: true
              parameter_group_name: no_swaa_no_gpt_4_1106
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
            default:
              annotate_p50: true
              parameter_group_name: default
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
            default_no_gpt_4_1106:
              annotate_p50: true
              parameter_group_name: default_no_gpt_4_1106
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
    outs:
    - path: plots/logistic/single_line_2023_ga_rebench_no_gpt_4_1106.png
      hash: md5
      md5: a1a26715d8ac92e6cbd744076c1d7a0b
      size: 140007
  plot_logistic_regression@double_line_all_data:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_fits/headline.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/double_line_all_data.png --log-level INFO --script-parameter-group
      double_line_all_data --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: data/wrangled/logistic_fits/headline.csv
      hash: md5
      md5: a93f56cf661066e31a0b5bfc7027b0a8
      size: 3461
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: bd4dc142d9d464770e93c429f37a33f9
      size: 23402
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        figs.plot_logistic_regression.double_line_all_data:
          runs_file: all_runs
          logistic_file: headline
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2019-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2027-01-01'
            display_r_squared: true
            data_file:
            styling:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2024-01-01'
            color: red
            line_start_date: '2023-08-01'
            line_end_date: '2027-01-01'
            display_r_squared: true
            data_file:
            styling:
          include_task_distribution: none
          weighting: invsqrt_task_weight
          x_lim_start: '2018-09-03'
          x_lim_end: '2027-01-01'
          exclude: []
          lower_y_lim: 0.0083333
          upper_y_lim: 11530
          exclude_agents:
          - GPT-4 0125
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: plots/logistic/double_line_all_data.png
      hash: md5
      md5: a65bc36223e4dc7c10839eca599f9c10
      size: 185038
  plot_logistic_regression@double_line_all_data_no_gpt_4_1106:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_fits_double_line_all_data.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/double_line_all_data_no_gpt_4_1106.png --log-level
      INFO --script-parameter-group double_line_all_data_no_gpt_4_1106 --params-file
      "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 9568cffc08d138d09c73e634d820d3a5
      size: 7909159
    - path: data/wrangled/logistic_fits_double_line_all_data.csv
      hash: md5
      md5: 31077b2c98f8b2563698cb92238f0ff0
      size: 3453
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: 3a32d389896d25952fac5226ebb4bd3b
      size: 23154
    - path: src/utils/plots.py
      hash: md5
      md5: bf8aec8c9607c4cd1c2ea4f316b3091f
      size: 5360
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002:
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            gpt2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002
          - gpt2
          - Human 8-hour score
          plot_logistic_regression:
            headline:
              parameter_group_name: headline
              trendlines:
              - fit_type: exponential
                caption:
                after_date: '2019-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 960
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench_no_gpt_4_1106:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench_no_gpt_4_1106
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_retrodict_excluding_swaa:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              include_task_distribution: none
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
            double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              include_task_distribution: none
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude:
              - SWAA
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 1106
              - GPT-4 0125
            double_line_all_data:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_no_gpt_4_1106:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_no_gpt_4_1106
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 1106
              - GPT-4 0125
            swe_bench:
              parameter_group_name: swe_bench
              weighting: invsqrt_task_weight
              regularization: 0.1
          plot_individual_histograms:
            no_swaa:
              annotate_p50: true
              parameter_group_name: no_swaa
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 1106
              - GPT-4 0314
            no_swaa_no_gpt_4_1106:
              annotate_p50: true
              parameter_group_name: no_swaa_no_gpt_4_1106
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
            default:
              annotate_p50: true
              parameter_group_name: default
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
            default_no_gpt_4_1106:
              annotate_p50: true
              parameter_group_name: default_no_gpt_4_1106
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
    outs:
    - path: plots/logistic/double_line_all_data_no_gpt_4_1106.png
      hash: md5
      md5: b6dc4356852991b0702c7eb2a4b87b15
      size: 177059
  plot_logistic_regression@double_line_all_data_retrodict_excluding_swaa:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_fits/double_line_all_data.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/double_line_all_data_retrodict_excluding_swaa.png
      --log-level INFO --script-parameter-group double_line_all_data_retrodict_excluding_swaa
      --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: data/wrangled/logistic_fits/double_line_all_data.csv
      hash: md5
      md5: db637b75ca4cda1406a9b68cbb85513e
      size: 3464
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: bd4dc142d9d464770e93c429f37a33f9
      size: 23402
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        figs.plot_logistic_regression.double_line_all_data_retrodict_excluding_swaa:
          runs_file: all_runs
          logistic_file: double_line_all_data
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          include_task_distribution: none
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption: Actual Fit
            after_date: '2018-01-01'
            color: black
            line_start_date: '2018-12-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            data_file:
            styling:
              linewidth: 2
              alpha: 0.2
              linestyle: solid
          - fit_type: exponential
            skip_annotation: false
            caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
            after_date: '2023-01-01'
            color: blue
            line_start_date: '2023-01-01'
            line_end_date: '2025-07-01'
            display_r_squared: false
            data_file: data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
            styling:
              linewidth: 2
              alpha: 0.6
              linestyle: dashed
          - fit_type: exponential
            skip_annotation: true
            caption: Fuzzy Extension of Retrodiction Fit
            after_date: '2023-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2023-01-01'
            display_r_squared: false
            data_file: data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
            styling:
              linewidth: 4
              alpha: 0.2
              linestyle: dotted
          exclude:
          - SWAA
          exclude_agents:
          - GPT-4 0125
          lower_y_lim: 0.0083333
          upper_y_lim: 240
          x_lim_start: '2018-09-03'
          x_lim_end: '2025-11-06'
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: plots/logistic/double_line_all_data_retrodict_excluding_swaa.png
      hash: md5
      md5: 1e170ffecf42d5b4834633172e894c60
      size: 187437
  plot_logistic_regression@double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106:
    cmd: python -m src.plot.logistic --input-file data/wrangled/logistic_fits_double_line_all_data.csv
      --runs-file data/external/all_runs.jsonl --release-dates data/external/release_dates.yaml
      --output-file plots/logistic/double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106.png
      --log-level INFO --script-parameter-group double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106
      --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 9568cffc08d138d09c73e634d820d3a5
      size: 7909159
    - path: data/wrangled/logistic_fits_double_line_all_data.csv
      hash: md5
      md5: 31077b2c98f8b2563698cb92238f0ff0
      size: 3453
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic.py
      hash: md5
      md5: 3a32d389896d25952fac5226ebb4bd3b
      size: 23154
    - path: src/utils/plots.py
      hash: md5
      md5: bf8aec8c9607c4cd1c2ea4f316b3091f
      size: 5360
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002:
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            gpt2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002
          - gpt2
          - Human 8-hour score
          plot_logistic_regression:
            headline:
              parameter_group_name: headline
              trendlines:
              - fit_type: exponential
                caption:
                after_date: '2019-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 960
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench_no_gpt_4_1106:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench_no_gpt_4_1106
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_retrodict_excluding_swaa:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              include_task_distribution: none
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
            double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              include_task_distribution: none
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude:
              - SWAA
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 1106
              - GPT-4 0125
            double_line_all_data:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_no_gpt_4_1106:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_no_gpt_4_1106
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 1106
              - GPT-4 0125
            swe_bench:
              parameter_group_name: swe_bench
              weighting: invsqrt_task_weight
              regularization: 0.1
          plot_individual_histograms:
            no_swaa:
              annotate_p50: true
              parameter_group_name: no_swaa
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 1106
              - GPT-4 0314
            no_swaa_no_gpt_4_1106:
              annotate_p50: true
              parameter_group_name: no_swaa_no_gpt_4_1106
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
            default:
              annotate_p50: true
              parameter_group_name: default
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
            default_no_gpt_4_1106:
              annotate_p50: true
              parameter_group_name: default_no_gpt_4_1106
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
    outs:
    - path: plots/logistic/double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106.png
      hash: md5
      md5: 27abb22aaf888b1579e83179b126db5d
      size: 185448
  plot_individual_histograms@default:
    cmd: python -m src.plot.individual_histograms --all-runs-file data/external/all_runs.jsonl
      --output-file plots/individual_histograms/default/histograms.png --plot-format
      png --log-level INFO --script-parameter-group default --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: data/wrangled/logistic_fits/double_line_all_data.csv
      hash: md5
      md5: db637b75ca4cda1406a9b68cbb85513e
      size: 3464
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/individual_histograms.py
      hash: md5
      md5: 4986698ec2a710291c6072fbcc243ee3
      size: 12688
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        figs.plot_individual_histograms.default:
          annotate_p50: true
          logistic_file: data/wrangled/logistic_fits/double_line_all_data.csv
          weighting: invsqrt_task_weight
          regularization: 0.1
          categories: ftr
          n_bins: 10
          n_subplot_cols: 3
          horizontal_lines:
          - p_success: 0.5
            styling:
              color: '#b30c00'
              linestyle: dashed
              linewidth: 2.5
              alpha: 1
          x_lim_start: '2022-12-01'
          x_lim_end: '2025-04-01'
          lower_y_lim: 0
          upper_y_lim: 1
          exclude: []
          include_agents:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: plots/individual_histograms/default/histograms.png
      hash: md5
      md5: 058b68b2185eea9820e64719a4d48a52
      size: 534036
  plot_individual_histograms@no_swaa:
    cmd: python -m src.plot.individual_histograms --all-runs-file data/external/all_runs.jsonl
      --output-file plots/individual_histograms/no_swaa/histograms.png --plot-format
      png --log-level INFO --script-parameter-group no_swaa --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
      hash: md5
      md5: c30790c50fd5945bef72badaa3725f21
      size: 3258
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/individual_histograms.py
      hash: md5
      md5: 4986698ec2a710291c6072fbcc243ee3
      size: 12688
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        figs.plot_individual_histograms.no_swaa:
          annotate_p50: true
          logistic_file: data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
          weighting: invsqrt_task_weight
          regularization: 0.1
          categories: ftr
          n_bins: 10
          n_subplot_cols: 3
          horizontal_lines:
          - p_success: 0.5
            styling:
              color: '#b30c00'
              linestyle: dashed
              linewidth: 2.5
              alpha: 1
          x_lim_start: '2022-12-01'
          x_lim_end: '2025-04-01'
          lower_y_lim: 0
          upper_y_lim: 1
          exclude:
          - SWAA
          include_agents:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: plots/individual_histograms/no_swaa/histograms.png
      hash: md5
      md5: 260a5ff76486e96835e3cd5d12c798b7
      size: 418862
  plot_individual_histograms@default_no_gpt_4_1106:
    cmd: python -m src.plot.individual_histograms --all-runs-file data/external/all_runs.jsonl
      --output-file plots/individual_histograms/default_no_gpt_4_1106/histograms.png
      --plot-format png --log-level INFO --script-parameter-group default_no_gpt_4_1106
      --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 9568cffc08d138d09c73e634d820d3a5
      size: 7909159
    - path: data/wrangled/logistic_fits_double_line_all_data.csv
      hash: md5
      md5: 31077b2c98f8b2563698cb92238f0ff0
      size: 3453
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/individual_histograms.py
      hash: md5
      md5: c14ef970a94570b7e69eb2d58f4b4908
      size: 12704
    - path: src/utils/plots.py
      hash: md5
      md5: bf8aec8c9607c4cd1c2ea4f316b3091f
      size: 5360
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002:
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            gpt2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002
          - gpt2
          - Human 8-hour score
          plot_logistic_regression:
            headline:
              parameter_group_name: headline
              trendlines:
              - fit_type: exponential
                caption:
                after_date: '2019-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 960
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench_no_gpt_4_1106:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench_no_gpt_4_1106
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_retrodict_excluding_swaa:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              include_task_distribution: none
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
            double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              include_task_distribution: none
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude:
              - SWAA
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 1106
              - GPT-4 0125
            double_line_all_data:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_no_gpt_4_1106:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_no_gpt_4_1106
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 1106
              - GPT-4 0125
            swe_bench:
              parameter_group_name: swe_bench
              weighting: invsqrt_task_weight
              regularization: 0.1
          plot_individual_histograms:
            no_swaa:
              annotate_p50: true
              parameter_group_name: no_swaa
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 1106
              - GPT-4 0314
            no_swaa_no_gpt_4_1106:
              annotate_p50: true
              parameter_group_name: no_swaa_no_gpt_4_1106
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
            default:
              annotate_p50: true
              parameter_group_name: default
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
            default_no_gpt_4_1106:
              annotate_p50: true
              parameter_group_name: default_no_gpt_4_1106
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
    outs:
    - path: plots/individual_histograms/default_no_gpt_4_1106/histograms.png
      hash: md5
      md5: 670a9278bd917c75c11f6042830d3f9a
      size: 524182
  plot_individual_histograms@no_swaa_no_gpt_4_1106:
    cmd: python -m src.plot.individual_histograms --all-runs-file data/external/all_runs.jsonl
      --output-file plots/individual_histograms/no_swaa_no_gpt_4_1106/histograms.png
      --plot-format png --log-level INFO --script-parameter-group no_swaa_no_gpt_4_1106
      --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 9568cffc08d138d09c73e634d820d3a5
      size: 7909159
    - path: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
      hash: md5
      md5: 13c722bbac3d61bd8d81f3ba53e478e6
      size: 3249
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/individual_histograms.py
      hash: md5
      md5: c14ef970a94570b7e69eb2d58f4b4908
      size: 12704
    - path: src/utils/plots.py
      hash: md5
      md5: bf8aec8c9607c4cd1c2ea4f316b3091f
      size: 5360
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002:
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            gpt2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002
          - gpt2
          - Human 8-hour score
          plot_logistic_regression:
            headline:
              parameter_group_name: headline
              trendlines:
              - fit_type: exponential
                caption:
                after_date: '2019-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 960
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench_no_gpt_4_1106:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench_no_gpt_4_1106
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_retrodict_excluding_swaa:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              include_task_distribution: none
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
            double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa_no_gpt_4_1106
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              include_task_distribution: none
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude:
              - SWAA
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 1106
              - GPT-4 0125
            double_line_all_data:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_no_gpt_4_1106:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_no_gpt_4_1106
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 1106
              - GPT-4 0125
            swe_bench:
              parameter_group_name: swe_bench
              weighting: invsqrt_task_weight
              regularization: 0.1
          plot_individual_histograms:
            no_swaa:
              annotate_p50: true
              parameter_group_name: no_swaa
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 1106
              - GPT-4 0314
            no_swaa_no_gpt_4_1106:
              annotate_p50: true
              parameter_group_name: no_swaa_no_gpt_4_1106
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
            default:
              annotate_p50: true
              parameter_group_name: default
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
            default_no_gpt_4_1106:
              annotate_p50: true
              parameter_group_name: default_no_gpt_4_1106
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
    outs:
    - path: plots/individual_histograms/no_swaa_no_gpt_4_1106/histograms.png
      hash: md5
      md5: 7b98a903c99fe86101f8e1296c430ebb
      size: 427595
  plot_individual_qqs@0:
    cmd: python -m src.plot.individual_histograms --all-runs-file data/external/all_runs.jsonl
      --output-file plots/individual_histograms/default/qq_plots.png --plot-format
      png --log-level INFO --script-parameter-group default --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: 9568cffc08d138d09c73e634d820d3a5
      size: 7909159
    - path: data/wrangled/logistic_fits_double_line_all_data.csv
      hash: md5
      md5: 31077b2c98f8b2563698cb92238f0ff0
      size: 3453
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/individual_histograms.py
      hash: md5
      md5: c14ef970a94570b7e69eb2d58f4b4908
      size: 12704
    - path: src/utils/plots.py
      hash: md5
      md5: bf8aec8c9607c4cd1c2ea4f316b3091f
      size: 5360
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002:
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            gpt2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002
          - gpt2
          - Human 8-hour score
          plot_logistic_regression:
            headline:
              parameter_group_name: headline
              trendlines:
              - fit_type: exponential
                caption:
                after_date: '2019-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 960
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_retrodict_excluding_swaa:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              include_task_distribution: none
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
            double_line_all_data:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 0125
            swe_bench:
              parameter_group_name: swe_bench
              weighting: invsqrt_task_weight
              regularization: 0.1
          plot_individual_histograms:
            no_swaa:
              annotate_p50: true
              parameter_group_name: no_swaa
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 1106
              - GPT-4 0314
            default:
              annotate_p50: true
              parameter_group_name: default
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
    outs:
    - path: plots/individual_histograms/default/qq_plots.png
      hash: md5
      md5: 670a9278bd917c75c11f6042830d3f9a
      size: 524182
  plot_individual_binned_residuals@0:
    cmd: python -m src.plot.individual_binned_residuals --all-runs-file data/external/all_runs.jsonl
      --output-file plots/individual_histograms/default/binned_residuals.png --plot-format
      png --log-level INFO --script-parameter-group default --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b075c912c1c344b15ba1424bc37c4a3a
      size: 9706149
    - path: data/wrangled/logistic_fits_double_line_all_data.csv
      hash: md5
      md5: 18cf9868a62cc0faec6b6772aec9004f
      size: 3461
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/individual_binned_residuals.py
      hash: md5
      md5: deb6b05b0aa78b0890463be0502851b8
      size: 5556
    - path: src/utils/plots.py
      hash: md5
      md5: bf8aec8c9607c4cd1c2ea4f316b3091f
      size: 5360
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002:
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            gpt2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002
          - gpt2
          - Human 8-hour score
          plot_logistic_regression:
            headline:
              parameter_group_name: headline
              trendlines:
              - fit_type: exponential
                caption:
                after_date: '2019-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 960
              exclude_agents:
              - GPT-4 0125
              - GPT-4 1106
            single_line_2023_ga_rebench:
              logistic_file_string: single_line_2023_ga_rebench
              parameter_group_name: single_line_2023_ga_rebench
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-04-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.5
                  linestyle: dashed
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0.5
              upper_y_lim: 480
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
            double_line_all_data_retrodict_excluding_swaa:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data_retrodict_excluding_swaa
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              include_task_distribution: none
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption: Actual Fit
                after_date: '2018-01-01'
                color: black
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
                  linewidth: 2
                  alpha: 0.2
                  linestyle: solid
              - fit_type: exponential
                skip_annotation: false
                caption: "Retrodiction Fit\n(Fit on 2023 onwards GA + RE data ONLY)"
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2023-01-01'
                line_end_date: '2025-07-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 2
                  alpha: 0.6
                  linestyle: dashed
              - fit_type: exponential
                skip_annotation: true
                caption: Fuzzy Extension of Retrodiction Fit
                after_date: '2023-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2023-01-01'
                display_r_squared: false
                data_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
                styling:
                  linewidth: 4
                  alpha: 0.2
                  linestyle: dotted
              exclude:
              - SWAA
              exclude_agents:
              - GPT-4 0125
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
            double_line_all_data:
              logistic_file_string: double_line_all_data
              parameter_group_name: double_line_all_data
              trendlines:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2019-01-01'
                color: blue
                line_start_date: '2018-12-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              - fit_type: exponential
                skip_annotation: false
                caption:
                after_date: '2024-01-01'
                color: red
                line_start_date: '2023-08-01'
                line_end_date: '2025-07-01'
                display_r_squared: true
                data_file:
                styling:
              include_task_distribution: none
              weighting: invsqrt_task_weight
              categories: ftr
              regularization: 0.1
              x_lim_start: '2018-09-03'
              x_lim_end: '2025-11-06'
              exclude: []
              lower_y_lim: 0.0083333
              upper_y_lim: 480
              exclude_agents:
              - GPT-4 0125
            swe_bench:
              parameter_group_name: swe_bench
              weighting: invsqrt_task_weight
              regularization: 0.1
          plot_individual_histograms:
            no_swaa:
              annotate_p50: true
              parameter_group_name: no_swaa
              logistic_file: data/wrangled/logistic_fits_single_line_2023_ga_rebench.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude:
              - SWAA
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 1106
              - GPT-4 0314
            default:
              annotate_p50: true
              parameter_group_name: default
              logistic_file: data/wrangled/logistic_fits_double_line_all_data.csv
              weighting: invsqrt_task_weight
              regularization: 0.1
              categories: ftr
              n_bins: 10
              n_subplot_cols: 3
              horizontal_lines:
              - p_success: 0.5
                styling:
                  color: '#b30c00'
                  linestyle: dashed
                  linewidth: 2.5
                  alpha: 1
              x_lim_start: '2022-12-01'
              x_lim_end: '2025-04-01'
              lower_y_lim: 0
              upper_y_lim: 1
              exclude: []
              include_agents:
              - Claude 3.5 Sonnet (New)
              - Claude 3.5 Sonnet (Old)
              - Claude 3 Opus
              - o1
              - o1-preview
              - GPT-4o
              - GPT-4 Turbo
              - GPT-4 0314
              - gpt-3.5-turbo-instruct
              - davinci-002
              - gpt2
    outs:
    - path: plots/individual_histograms/default/binned_residuals.png
      hash: md5
      md5: 56d9a3cdf593a7ba553e21766615f461
      size: 690219
  plot_logistic_individual@true:
    cmd: python -m src.plot.logistic_individual --input-file data/wrangled/logistic_fits/headline.csv
      --output-file plots/logistic_individual/p50_true.png --log-level INFO --show-p50
      true
    deps:
    - path: data/wrangled/logistic_fits/headline.csv
      hash: md5
      md5: a93f56cf661066e31a0b5bfc7027b0a8
      size: 3461
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic_individual.py
      hash: md5
      md5: 89231571bb238143e41d582cb1cb100c
      size: 4454
    - path: src/utils/logistic.py
      hash: md5
      md5: 33bb43781a4954021361ec1e4931aac8
      size: 687
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: plots/logistic_individual/p50_true.png
      hash: md5
      md5: 93b3e80461f49ea95d8e6e251273fa3e
      size: 75214
  plot_logistic_individual@false:
    cmd: python -m src.plot.logistic_individual --input-file data/wrangled/logistic_fits/headline.csv
      --output-file plots/logistic_individual/p50_false.png --log-level INFO --show-p50
      false
    deps:
    - path: data/wrangled/logistic_fits/headline.csv
      hash: md5
      md5: a93f56cf661066e31a0b5bfc7027b0a8
      size: 3461
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/logistic_individual.py
      hash: md5
      md5: 89231571bb238143e41d582cb1cb100c
      size: 4454
    - path: src/utils/logistic.py
      hash: md5
      md5: 33bb43781a4954021361ec1e4931aac8
      size: 687
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: plots/logistic_individual/p50_false.png
      hash: md5
      md5: 30b2cd5379e10937d346dc9dffe2e6cb
      size: 67488
  plot_bootstrap_ci@headline:
    cmd: python -m src.plot.bootstrap_ci --fig-name headline --input-file data/wrangled/bootstrap/headline.csv
      --agent-summaries-file data/wrangled/logistic_fits/headline.csv --release-dates
      data/external/release_dates.yaml --output-file plots/bootstrap/headline.png
      --log-level INFO
    deps:
    - path: data/external/release_dates.yaml
      hash: md5
      md5: db301efa7b0ee4e1bc2f424efacc8e1c
      size: 570
    - path: data/wrangled/bootstrap/headline.csv
      hash: md5
      md5: eb7b8bb61aaa1f708be4050500d78306
      size: 130061
    - path: data/wrangled/logistic_fits/headline.csv
      hash: md5
      md5: a93f56cf661066e31a0b5bfc7027b0a8
      size: 3461
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/bootstrap_ci.py
      hash: md5
      md5: b12a5d48f5425d84384f4105bf0d3b23
      size: 7912
    - path: src/plot/logistic.py
      hash: md5
      md5: bd4dc142d9d464770e93c429f37a33f9
      size: 23402
    params:
      params.yaml:
        figs.plot_logistic_regression.headline:
          runs_file: all_runs
          trendlines:
          - fit_type: exponential
            caption:
            after_date: '2019-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2027-01-01'
            display_r_squared: true
            styling:
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2018-09-03'
          x_lim_end: '2027-01-01'
          exclude: []
          lower_y_lim: 0.0083333
          upper_y_lim: 360
          exclude_agents:
          - GPT-4 0125
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: plots/bootstrap/headline.png
      hash: md5
      md5: 0e29f38cfb2dd095effc0f84d41bb693
      size: 218759
  wrangle_bootstrap_logistic@headline:
    cmd: python -m src.wrangle.bootstrap --fig-name headline --runs-file data/external/all_runs.jsonl
      --output-bootstrap-horizons-file data/wrangled/bootstrap/headline.csv --n-bootstrap
      500
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: src/utils/logistic.py
      hash: md5
      md5: 33bb43781a4954021361ec1e4931aac8
      size: 687
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: 52a8663dc98f95603333dc326f991281
      size: 10806
    - path: src/wrangle/logistic.py
      hash: md5
      md5: e8b63048ec10d265da61765e88ebb9ca
      size: 7012
    params:
      params.yaml:
        figs.plot_logistic_regression.headline:
          runs_file: all_runs
          trendlines:
          - fit_type: exponential
            caption:
            after_date: '2019-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2027-01-01'
            display_r_squared: true
            styling:
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2018-09-03'
          x_lim_end: '2027-01-01'
          exclude: []
          lower_y_lim: 0.0083333
          upper_y_lim: 360
          exclude_agents:
          - GPT-4 0125
    outs:
    - path: data/wrangled/bootstrap/headline.csv
      hash: md5
      md5: eb7b8bb61aaa1f708be4050500d78306
      size: 130061
  wrangle_bootstrap_logistic@single_line_2023_ga_rebench:
    cmd: python -m src.wrangle.bootstrap --fig-name single_line_2023_ga_rebench --runs-file
      data/external/all_runs.jsonl --output-bootstrap-horizons-file data/wrangled/bootstrap/single_line_2023_ga_rebench.csv
      --n-bootstrap 500
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: src/utils/logistic.py
      hash: md5
      md5: 33bb43781a4954021361ec1e4931aac8
      size: 687
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: 52a8663dc98f95603333dc326f991281
      size: 10806
    - path: src/wrangle/logistic.py
      hash: md5
      md5: e8b63048ec10d265da61765e88ebb9ca
      size: 7012
    params:
      params.yaml:
        figs.plot_logistic_regression.single_line_2023_ga_rebench:
          runs_file: all_runs
          logistic_file: single_line_2023_ga_rebench
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2023-01-01'
            color: blue
            line_start_date: '2023-01-01'
            line_end_date: '2025-04-01'
            display_r_squared: true
            data_file:
            styling:
              linewidth: 2
              alpha: 0.5
              linestyle: dashed
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2022-12-01'
          x_lim_end: '2025-04-01'
          lower_y_lim: 0.5
          upper_y_lim: 120
          exclude:
          - SWAA
          exclude_agents:
          - GPT-4 0125
    outs:
    - path: data/wrangled/bootstrap/single_line_2023_ga_rebench.csv
      hash: md5
      md5: 0546bce90709499cf1786dba11558c41
      size: 111428
  wrangle_bootstrap_logistic@double_line_all_data:
    cmd: python -m src.wrangle.bootstrap --fig-name double_line_all_data --runs-file
      data/external/all_runs.jsonl --output-bootstrap-horizons-file data/wrangled/bootstrap/double_line_all_data.csv
      --n-bootstrap 500
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: src/utils/logistic.py
      hash: md5
      md5: 33bb43781a4954021361ec1e4931aac8
      size: 687
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: 52a8663dc98f95603333dc326f991281
      size: 10806
    - path: src/wrangle/logistic.py
      hash: md5
      md5: e8b63048ec10d265da61765e88ebb9ca
      size: 7012
    params:
      params.yaml:
        figs.plot_logistic_regression.double_line_all_data:
          runs_file: all_runs
          logistic_file: headline
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2019-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            data_file:
            styling:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2024-01-01'
            color: red
            line_start_date: '2023-08-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            data_file:
            styling:
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2018-09-03'
          x_lim_end: '2025-11-06'
          exclude: []
          lower_y_lim: 0.0083333
          upper_y_lim: 240
          exclude_agents:
          - GPT-4 0125
    outs:
    - path: data/wrangled/bootstrap/double_line_all_data.csv
      hash: md5
      md5: fc3d4fcbfe17ff93987f14aff070a538
      size: 130102
  wrangle_bootstrap_logistic@swe_bench:
    cmd: python -m src.wrangle.bootstrap --fig-name swe_bench --runs-file data/external/swe_bench_runs.jsonl
      --output-bootstrap-horizons-file data/wrangled/bootstrap/swe_bench.csv --n-bootstrap
      500
    deps:
    - path: data/external/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
    - path: src/utils/logistic.py
      hash: md5
      md5: 33bb43781a4954021361ec1e4931aac8
      size: 687
    - path: src/wrangle/bootstrap.py
      hash: md5
      md5: 52a8663dc98f95603333dc326f991281
      size: 10806
    - path: src/wrangle/logistic.py
      hash: md5
      md5: e8b63048ec10d265da61765e88ebb9ca
      size: 7012
    params:
      params.yaml:
        figs.plot_logistic_regression.swe_bench:
          runs_file: swe_bench_runs
          logistic_file: swe_bench
          weighting: invsqrt_task_weight
          regularization: 0.1
    outs:
    - path: data/wrangled/bootstrap/swe_bench.csv
      hash: md5
      md5: 3dbf1b003dbe1cab98fd2d9e8e62cd3e
      size: 55577
  wrangle_logistic_regression@headline:
    cmd: python -m src.wrangle.logistic --fig-name headline --runs-file data/external/all_runs.jsonl
      --output-logistic-fits-file data/wrangled/logistic_fits/headline.csv --release-dates
      data/external/release_dates.yaml --bootstrap-file data/wrangled/bootstrap/headline.csv
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: data/external/release_dates.yaml
      hash: md5
      md5: db301efa7b0ee4e1bc2f424efacc8e1c
      size: 570
    - path: data/wrangled/bootstrap/headline.csv
      hash: md5
      md5: eb7b8bb61aaa1f708be4050500d78306
      size: 130061
    - path: src/wrangle/logistic.py
      hash: md5
      md5: e8b63048ec10d265da61765e88ebb9ca
      size: 7012
    params:
      params.yaml:
        figs.plot_logistic_regression.headline:
          runs_file: all_runs
          trendlines:
          - fit_type: exponential
            caption:
            after_date: '2019-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2027-01-01'
            display_r_squared: true
            styling:
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2018-09-03'
          x_lim_end: '2027-01-01'
          exclude: []
          lower_y_lim: 0.0083333
          upper_y_lim: 360
          exclude_agents:
          - GPT-4 0125
    outs:
    - path: data/wrangled/logistic_fits/headline.csv
      hash: md5
      md5: a93f56cf661066e31a0b5bfc7027b0a8
      size: 3461
  wrangle_logistic_regression@single_line_2023_ga_rebench:
    cmd: python -m src.wrangle.logistic --fig-name single_line_2023_ga_rebench --runs-file
      data/external/all_runs.jsonl --output-logistic-fits-file data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
      --release-dates data/external/release_dates.yaml --bootstrap-file data/wrangled/bootstrap/single_line_2023_ga_rebench.csv
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: data/external/release_dates.yaml
      hash: md5
      md5: db301efa7b0ee4e1bc2f424efacc8e1c
      size: 570
    - path: data/wrangled/bootstrap/single_line_2023_ga_rebench.csv
      hash: md5
      md5: 0546bce90709499cf1786dba11558c41
      size: 111428
    - path: src/wrangle/logistic.py
      hash: md5
      md5: e8b63048ec10d265da61765e88ebb9ca
      size: 7012
    params:
      params.yaml:
        figs.plot_logistic_regression.single_line_2023_ga_rebench:
          runs_file: all_runs
          logistic_file: single_line_2023_ga_rebench
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2023-01-01'
            color: blue
            line_start_date: '2023-01-01'
            line_end_date: '2025-04-01'
            display_r_squared: true
            data_file:
            styling:
              linewidth: 2
              alpha: 0.5
              linestyle: dashed
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2022-12-01'
          x_lim_end: '2025-04-01'
          lower_y_lim: 0.5
          upper_y_lim: 120
          exclude:
          - SWAA
          exclude_agents:
          - GPT-4 0125
    outs:
    - path: data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
      hash: md5
      md5: c30790c50fd5945bef72badaa3725f21
      size: 3258
  wrangle_logistic_regression@double_line_all_data:
    cmd: python -m src.wrangle.logistic --fig-name double_line_all_data --runs-file
      data/external/all_runs.jsonl --output-logistic-fits-file data/wrangled/logistic_fits/double_line_all_data.csv
      --release-dates data/external/release_dates.yaml --bootstrap-file data/wrangled/bootstrap/double_line_all_data.csv
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: data/external/release_dates.yaml
      hash: md5
      md5: db301efa7b0ee4e1bc2f424efacc8e1c
      size: 570
    - path: data/wrangled/bootstrap/double_line_all_data.csv
      hash: md5
      md5: fc3d4fcbfe17ff93987f14aff070a538
      size: 130102
    - path: src/wrangle/logistic.py
      hash: md5
      md5: e8b63048ec10d265da61765e88ebb9ca
      size: 7012
    params:
      params.yaml:
        figs.plot_logistic_regression.double_line_all_data:
          runs_file: all_runs
          logistic_file: headline
          trendlines:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2019-01-01'
            color: blue
            line_start_date: '2018-12-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            data_file:
            styling:
          - fit_type: exponential
            skip_annotation: false
            caption:
            after_date: '2024-01-01'
            color: red
            line_start_date: '2023-08-01'
            line_end_date: '2025-07-01'
            display_r_squared: true
            data_file:
            styling:
          include_task_distribution: none
          weighting: invsqrt_task_weight
          categories: ftr
          regularization: 0.1
          x_lim_start: '2018-09-03'
          x_lim_end: '2025-11-06'
          exclude: []
          lower_y_lim: 0.0083333
          upper_y_lim: 240
          exclude_agents:
          - GPT-4 0125
    outs:
    - path: data/wrangled/logistic_fits/double_line_all_data.csv
      hash: md5
      md5: ab3c831034ba243517fd3133ef6430b4
      size: 3468
  wrangle_logistic_regression@swe_bench:
    cmd: python -m src.wrangle.logistic --fig-name swe_bench --runs-file data/external/swe_bench_runs.jsonl
      --output-logistic-fits-file data/wrangled/logistic_fits/swe_bench.csv --release-dates
      data/external/release_dates.yaml --bootstrap-file data/wrangled/bootstrap/swe_bench.csv
    deps:
    - path: data/external/release_dates.yaml
      hash: md5
      md5: db301efa7b0ee4e1bc2f424efacc8e1c
      size: 570
    - path: data/external/swe_bench_runs.jsonl
      hash: md5
      md5: 08e2dc3b47344bc919f7d608fbad1592
      size: 740060
    - path: data/wrangled/bootstrap/swe_bench.csv
      hash: md5
      md5: 3dbf1b003dbe1cab98fd2d9e8e62cd3e
      size: 55577
    - path: src/wrangle/logistic.py
      hash: md5
      md5: e8b63048ec10d265da61765e88ebb9ca
      size: 7012
    params:
      params.yaml:
        figs.plot_logistic_regression.swe_bench:
          runs_file: swe_bench_runs
          logistic_file: swe_bench
          weighting: invsqrt_task_weight
          regularization: 0.1
    outs:
    - path: data/wrangled/logistic_fits/swe_bench.csv
      hash: md5
      md5: 1bfdc8a6cb1bbe19812896db5628797b
      size: 1484
  plot_individual_binned_residuals@default:
    cmd: python -m src.plot.individual_binned_residuals --all-runs-file data/external/all_runs.jsonl
      --output-file plots/individual_histograms/default/binned_residuals.png --plot-format
      png --log-level INFO --script-parameter-group default --params-file "params.yaml"
    deps:
    - path: data/external/all_runs.jsonl
      hash: md5
      md5: b8dca04daab5f4d2bbfdedbb1ce62e5a
      size: 9718647
    - path: data/wrangled/logistic_fits/double_line_all_data.csv
      hash: md5
      md5: db637b75ca4cda1406a9b68cbb85513e
      size: 3464
    - path: matplotlibrc
      hash: md5
      md5: e5c44785adee259a340f12544e2cb856
      size: 526
    - path: src/plot/individual_binned_residuals.py
      hash: md5
      md5: 8c41fba66aed360441334a4524ca4f7d
      size: 5546
    - path: src/utils/plots.py
      hash: md5
      md5: b15dd0e3262c590e9a866cbe38d4695d
      size: 5419
    params:
      params.yaml:
        figs.plot_individual_histograms.default:
          annotate_p50: true
          logistic_file: data/wrangled/logistic_fits/double_line_all_data.csv
          weighting: invsqrt_task_weight
          regularization: 0.1
          categories: ftr
          n_bins: 10
          n_subplot_cols: 3
          horizontal_lines:
          - p_success: 0.5
            styling:
              color: '#b30c00'
              linestyle: dashed
              linewidth: 2.5
              alpha: 1
          x_lim_start: '2022-12-01'
          x_lim_end: '2025-04-01'
          lower_y_lim: 0
          upper_y_lim: 1
          exclude: []
          include_agents:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
        log_level: INFO
        plot_format: png
        plots:
          suptitle_fontsize: 18
          xlabelpad: 10
          ylabelpad: 10
          ax_label_fontsize: 14
          title_fontsize: 16
          task_distribution_styling:
            hist:
              edgecolor: '#a6a6a6'
              color: '#d4d4d4'
              alpha: 1
              linewidth: 1
              zorder: 50
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
          scatter_styling:
            error_bar:
              color: grey
              fmt: none
              capsize: 2
              alpha: 1
              zorder: 9
              linewidth: 1.5
              capthick: 1.5
            grid:
              which: major
              linestyle: '-'
              alpha: 0.2
              color: grey
            scatter:
              s: 150
              edgecolor: black
              linewidth: 0.5
              zorder: 10
          agent_styling:
            Claude 3.5 Sonnet (New):
              lab_color: '#e26e2f'
              marker: s
              unique_color: '#8B4DC9'
            Claude 3.5 Sonnet (Old):
              lab_color: '#e26e2f'
              marker: ^
              unique_color: '#9B6BE0'
            Claude 3 Opus:
              lab_color: '#e26e2f'
              marker: o
              unique_color: '#B594E8'
            o1:
              lab_color: '#3e805f'
              marker: P
              unique_color: '#228B22'
            o1-preview:
              lab_color: '#3e805f'
              marker: X
              unique_color: '#3CB371'
            GPT-4o:
              lab_color: '#3e805f'
              marker: d
              unique_color: '#2B8FB0'
            GPT-4 Turbo:
              lab_color: '#3e805f'
              marker: v
              unique_color: '#4A9CBD'
            GPT-4 1106:
              lab_color: '#3e805f'
              marker: D
              unique_color: '#87CEEB'
            GPT-4 0314:
              lab_color: '#3e805f'
              marker: s
              unique_color: '#87CEEB'
            gpt-3.5-turbo-instruct:
              lab_color: '#3e805f'
              marker: ^
              unique_color: '#CCE6FF'
            davinci-002 (GPT-3):
              lab_color: '#3e805f'
              marker: o
              unique_color: '#B3E0FF'
            GPT-2:
              lab_color: '#3e805f'
              marker: '*'
              unique_color: '#CCE6FF'
            human:
              lab_color: grey
              marker: o
              unique_color: '#858585'
            default:
              lab_color: black
              marker: o
              unique_color: black
          performance_over_time_trendline_styling:
            linear:
              annotation:
                color: red
                fontsize: 10
              line:
                color: red
                alpha: 0.5
                linewidth: 2
            exponential:
              annotation:
                color: blue
                fontsize: 10
              line:
                color: blue
                alpha: 0.5
                linewidth: 2
            hyperbolic:
              annotation:
                color: green
                fontsize: 10
              line:
                color: green
                alpha: 0.5
                linewidth: 2
            default:
              annotation:
                color: black
                fontsize: 10
              line:
                color: black
                alpha: 0.5
                linewidth: 2
          legend_order:
          - Claude 3.5 Sonnet (New)
          - Claude 3.5 Sonnet (Old)
          - Claude 3 Opus
          - o1
          - o1-preview
          - GPT-4o
          - GPT-4 Turbo
          - GPT-4 1106
          - GPT-4 0314
          - gpt-3.5-turbo-instruct
          - davinci-002 (GPT-3)
          - GPT-2
          - Human 8-hour score
    outs:
    - path: plots/individual_histograms/default/binned_residuals.png
      hash: md5
      md5: 37374d88bf2b2289a21a7e0aa9efcbf3
      size: 674920