# params.yaml

# This file is automatically generated by `compose_report_params overtime`.
# Do not edit it directly.

# Global settings. The names suggest a logging verbosity, a bootstrap
# resample count and the image format for saved plots; how each value is
# consumed is not visible in this file - confirm with the pipeline code.
log_level: INFO
n_bootstrap: 1000
plot_format: png
# Task-weighting schemes. Each entry pairs a weight column name with a text
# snippet describing it (presumably shown on graphs - confirm with consumer).
weighting:
- weight_col: equal_task_weight
  graph_snippet: 'Equally weighted tasks'
- weight_col: invsqrt_task_weight
  # Quoted so the '#' inside the text can never be mistaken for a comment.
  graph_snippet: 'Tasks diversity-weighted (1/sqrt(# tasks in family))'
- weight_col: null
  # This is the string "None", not a YAML null (unlike weight_col above).
  graph_snippet: 'None'
# Per-stage parameters, keyed by stage name.
stages:
  # Task families listed for the filter_aird_runs stage; every entry carries
  # the ai_rd_ prefix. Whether the list acts as an allow-list or a deny-list
  # is not visible here - confirm with the stage implementation.
  filter_aird_runs:
    task_families:
    - ai_rd_fix_embedding
    - ai_rd_nanogpt_chat_rl
    - ai_rd_optimize_llm_foundry
    - ai_rd_restricted_mlm
    - ai_rd_rust_codecontests_inference
    - ai_rd_small_scaling_law
    - ai_rd_triton_cumsum
  # A single time limit. 36000 equals max_time_limit_in_seconds under
  # rebench_best_of_k_parameters, so the unit is presumably seconds
  # (36000 s = 10 h) - confirm.
  wrangle_bar_by_time_allocation:
    time_limits:
    - 36000
  # Bar-chart stage parameters: the agents listed under focus_agents, the
  # weight column to use and an (empty) exclude list.
  plot_bar_chart_weighted_scores:
    # These twelve names are the keys of plots.agent_styling other than
    # `human` and `default`.
    focus_agents:
    - Claude 3 Opus
    - Claude 3.5 Sonnet (New)
    - Claude 3.5 Sonnet (Old)
    - GPT-4 Turbo
    - GPT-4 0314
    - GPT-4 1106
    - GPT-4o
    - davinci-002 (GPT-3)
    - GPT-2
    - gpt-3.5-turbo-instruct
    - o1-preview
    - o1
    # Matches one of the weight_col values in the top-level weighting list.
    weighting: invsqrt_task_weight
    exclude: []
# Best-of-k parameters. This key sits at the top level, not under `stages`.
# The only time limit listed equals max_time_limit_in_seconds
# (36000 s = 10 h).
rebench_best_of_k_parameters:
  time_limits:
  - 36000
  max_time_limit_in_seconds: 36000
# Shared plot styling.
plots:
  # Font sizes and axis-label padding. The names resemble matplotlib options
  # (units presumably points) - confirm with the plotting code.
  suptitle_fontsize: 18
  xlabelpad: 10
  ylabelpad: 10
  ax_label_fontsize: 14
  title_fontsize: 16
  # Styling for the task-distribution plot: one mapping for the histogram
  # and one for the grid. Keys look like matplotlib keyword arguments -
  # confirm how they are passed through.
  task_distribution_styling:
    hist:
      edgecolor: '#a6a6a6'
      color: '#d4d4d4'
      alpha: 1
      linewidth: 1
      zorder: 50
    # Same grid values as plots.scatter_styling.grid.
    grid:
      which: major
      linestyle: '-'
      alpha: 0.2
      color: grey
  # Styling for scatter plots: error bars, grid and the scatter points.
  # Keys look like matplotlib keyword arguments - confirm.
  scatter_styling:
    error_bar:
      color: grey
      # The string "none", not a YAML null.
      fmt: none
      capsize: 2
      alpha: 1
      # One below the scatter zorder (10), so points presumably draw on top
      # of their error bars - confirm.
      zorder: 9
      linewidth: 1.5
      capthick: 1.5
    # Same grid values as plots.task_distribution_styling.grid.
    grid:
      which: major
      linestyle: '-'
      alpha: 0.2
      color: grey
    scatter:
      s: 150
      edgecolor: black
      linewidth: 0.5
      zorder: 10
  # Per-agent plot styling, keyed by agent display name. Each entry carries a
  # lab_color, a marker and a unique_color. In this file lab_color is
  # '#e26e2f' for the Claude entries, '#3e805f' for the o1/GPT/davinci
  # entries, grey for human and black for default; markers are distinct
  # within each lab_color group. Marker codes look like matplotlib markers -
  # confirm.
  # NOTE(review): unique_color is not actually unique. '#87CEEB' is shared by
  # GPT-4 1106 and GPT-4 0314, and '#CCE6FF' by gpt-3.5-turbo-instruct and
  # GPT-2. Confirm against the generator's inputs whether this is intended.
  agent_styling:
    Claude 3.5 Sonnet (New):
      lab_color: '#e26e2f'
      marker: s
      unique_color: '#8B4DC9'
    Claude 3.5 Sonnet (Old):
      lab_color: '#e26e2f'
      marker: ^
      unique_color: '#9B6BE0'
    Claude 3 Opus:
      lab_color: '#e26e2f'
      marker: o
      unique_color: '#B594E8'
    o1:
      lab_color: '#3e805f'
      marker: P
      unique_color: '#228B22'
    o1-preview:
      lab_color: '#3e805f'
      marker: X
      unique_color: '#3CB371'
    GPT-4o:
      lab_color: '#3e805f'
      marker: d
      unique_color: '#2B8FB0'
    GPT-4 Turbo:
      lab_color: '#3e805f'
      marker: v
      unique_color: '#4A9CBD'
    GPT-4 1106:
      lab_color: '#3e805f'
      marker: D
      unique_color: '#87CEEB'
    GPT-4 0314:
      lab_color: '#3e805f'
      marker: s
      # NOTE(review): same unique_color as GPT-4 1106.
      unique_color: '#87CEEB'
    gpt-3.5-turbo-instruct:
      lab_color: '#3e805f'
      marker: ^
      unique_color: '#CCE6FF'
    davinci-002 (GPT-3):
      lab_color: '#3e805f'
      marker: o
      unique_color: '#B3E0FF'
    GPT-2:
      lab_color: '#3e805f'
      # Quoted because a leading '*' would otherwise start a YAML alias.
      marker: '*'
      # NOTE(review): same unique_color as gpt-3.5-turbo-instruct.
      unique_color: '#CCE6FF'
    human:
      lab_color: grey
      marker: o
      unique_color: '#858585'
    # Presumably the fallback for agents without their own entry - confirm.
    default:
      lab_color: black
      marker: o
      unique_color: black
  # Trendline styling keyed by fit type (linear, exponential, hyperbolic)
  # plus a `default` entry. Every entry has the same shape - an annotation
  # mapping and a line mapping - and the entries differ only in colour
  # (red, blue, green, black).
  performance_over_time_trendline_styling:
    linear:
      annotation:
        color: red
        fontsize: 10
      line:
        color: red
        alpha: 0.5
        linewidth: 2
    exponential:
      annotation:
        color: blue
        fontsize: 10
      line:
        color: blue
        alpha: 0.5
        linewidth: 2
    hyperbolic:
      annotation:
        color: green
        fontsize: 10
      line:
        color: green
        alpha: 0.5
        linewidth: 2
    # Presumably used for fit types not listed above - confirm.
    default:
      annotation:
        color: black
        fontsize: 10
      line:
        color: black
        alpha: 0.5
        linewidth: 2
  # Legend ordering. The first twelve entries follow the key order of
  # plots.agent_styling.
  # NOTE(review): the last entry, `Human 8-hour score`, has no identically
  # named key in agent_styling (which has `human`) - confirm how the label is
  # matched.
  legend_order:
  - Claude 3.5 Sonnet (New)
  - Claude 3.5 Sonnet (Old)
  - Claude 3 Opus
  - o1
  - o1-preview
  - GPT-4o
  - GPT-4 Turbo
  - GPT-4 1106
  - GPT-4 0314
  - gpt-3.5-turbo-instruct
  - davinci-002 (GPT-3)
  - GPT-2
  - Human 8-hour score
# Report identity. The name matches the `overtime` argument in the
# `compose_report_params overtime` command named in the file header.
report:
  name: overtime
# Figure definitions, grouped by plotting stage and then by figure name.
figs:
  plot_logistic_regression:
    # Headline figure: one blue exponential trendline (after_date 2019-01-01)
    # drawn from 2018-12-01 to 2027-01-01 with display_r_squared enabled.
    # Unlike the sibling figures it sets no logistic_file, skip_annotation or
    # data_file - presumably defaults apply; confirm with the consumer.
    headline:
      runs_file: all_runs
      trendlines:
      - fit_type: exponential
        caption: null
        after_date: '2019-01-01'
        color: blue
        line_start_date: '2018-12-01'
        line_end_date: '2027-01-01'
        display_r_squared: true
        styling: null
      # The string "none", not a YAML null.
      include_task_distribution: none
      weighting: invsqrt_task_weight
      categories: ftr
      regularization: 0.1
      x_lim_start: '2018-09-03'
      x_lim_end: '2027-01-01'
      exclude: []
      lower_y_lim: 0.0083333
      upper_y_lim: 360
      exclude_agents:
      - GPT-4 0125
    # Single dashed exponential trendline with after_date 2023-01-01, drawn
    # from 2023-01-01 to 2025-04-01. SWAA is listed under exclude. The
    # logistic_file carries this figure's own name; the same base name
    # appears in the data/wrangled/logistic_fits/...csv paths referenced by
    # other figures in this file.
    single_line_2023_ga_rebench:
      runs_file: all_runs
      logistic_file: single_line_2023_ga_rebench
      trendlines:
      - fit_type: exponential
        skip_annotation: false
        caption: null
        after_date: '2023-01-01'
        color: blue
        line_start_date: '2023-01-01'
        line_end_date: '2025-04-01'
        display_r_squared: true
        data_file: null
        styling:
          linewidth: 2
          alpha: 0.5
          linestyle: dashed
      # The string "none", not a YAML null.
      include_task_distribution: none
      weighting: invsqrt_task_weight
      categories: ftr
      regularization: 0.1
      x_lim_start: '2022-12-01'
      x_lim_end: '2025-04-01'
      lower_y_lim: 0.5
      upper_y_lim: 120
      exclude:
      - SWAA
      exclude_agents:
      - GPT-4 0125
    # Retrodiction figure with three exponential trendlines:
    #   1. "Actual Fit" (black, solid), with no data_file set.
    #   2. "Retrodiction Fit" (blue, dashed), read from the
    #      single_line_2023_ga_rebench fit file and drawn from 2023-01-01.
    #   3. A dotted, un-annotated extension of line 2, drawn from 2018-12-01
    #      up to 2023-01-01 (i.e. before that fit's after_date).
    # SWAA is listed under exclude.
    double_line_all_data_retrodict_excluding_swaa:
      runs_file: all_runs
      logistic_file: double_line_all_data
      weighting: invsqrt_task_weight
      categories: ftr
      regularization: 0.1
      # The string "none", not a YAML null.
      include_task_distribution: none
      trendlines:
      - fit_type: exponential
        skip_annotation: false
        caption: 'Actual Fit'
        after_date: '2018-01-01'
        color: black
        line_start_date: '2018-12-01'
        line_end_date: '2025-07-01'
        display_r_squared: true
        data_file: null
        styling:
          linewidth: 2
          alpha: 0.2
          linestyle: solid
      - fit_type: exponential
        skip_annotation: false
        # Two-line caption; `|-` keeps the inner newline and drops the
        # trailing one.
        caption: |-
          Retrodiction Fit
          (Fit on 2023 onwards GA + RE data ONLY)
        after_date: '2023-01-01'
        color: blue
        line_start_date: '2023-01-01'
        line_end_date: '2025-07-01'
        display_r_squared: false
        data_file: data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
        styling:
          linewidth: 2
          alpha: 0.6
          linestyle: dashed
      - fit_type: exponential
        skip_annotation: true
        caption: 'Fuzzy Extension of Retrodiction Fit'
        after_date: '2023-01-01'
        color: blue
        line_start_date: '2018-12-01'
        line_end_date: '2023-01-01'
        display_r_squared: false
        data_file: data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
        styling:
          linewidth: 4
          alpha: 0.2
          linestyle: dotted
      exclude:
      - SWAA
      exclude_agents:
      - GPT-4 0125
      lower_y_lim: 0.0083333
      upper_y_lim: 240
      x_lim_start: '2018-09-03'
      x_lim_end: '2025-11-06'
    # Two exponential trendlines over the same runs file: a blue one with
    # after_date 2019-01-01 and a red one with after_date 2024-01-01, both
    # drawn out to 2027-01-01.
    # NOTE(review): logistic_file is `headline` rather than this figure's own
    # name, and unlike `headline` this entry sets no categories or
    # regularization - presumably it reuses the headline fits; confirm.
    # Other entries in this file refer to a `double_line_all_data` logistic
    # file, so check which figure is expected to produce it.
    double_line_all_data:
      runs_file: all_runs
      logistic_file: headline
      trendlines:
      - fit_type: exponential
        skip_annotation: false
        caption: null
        after_date: '2019-01-01'
        color: blue
        line_start_date: '2018-12-01'
        line_end_date: '2027-01-01'
        display_r_squared: true
        data_file: null
        styling: null
      - fit_type: exponential
        skip_annotation: false
        caption: null
        after_date: '2024-01-01'
        color: red
        line_start_date: '2023-08-01'
        line_end_date: '2027-01-01'
        display_r_squared: true
        data_file: null
        styling: null
      # The string "none", not a YAML null.
      include_task_distribution: none
      weighting: invsqrt_task_weight
      x_lim_start: '2018-09-03'
      x_lim_end: '2027-01-01'
      exclude: []
      lower_y_lim: 0.0083333
      upper_y_lim: 11530
      exclude_agents:
      - GPT-4 0125
    # Minimal entry: it uses its own runs file (swe_bench_runs) and sets only
    # logistic_file, weighting and regularization. No trendlines, axis limits
    # or exclude lists are given here - presumably defaults apply; confirm.
    swe_bench:
      runs_file: swe_bench_runs
      logistic_file: swe_bench
      weighting: invsqrt_task_weight
      regularization: 0.1
  # Per-agent histogram figures, keyed by figure name.
  plot_individual_histograms:
    # Variant with SWAA listed under exclude. It reads the
    # single_line_2023_ga_rebench fit file and lists nine agents.
    no_swaa:
      annotate_p50: true
      logistic_file: data/wrangled/logistic_fits/single_line_2023_ga_rebench.csv
      weighting: invsqrt_task_weight
      regularization: 0.1
      categories: ftr
      n_bins: 10
      n_subplot_cols: 3
      # One dashed horizontal line at p_success 0.5.
      horizontal_lines:
      - p_success: 0.5
        styling:
          color: '#b30c00'
          linestyle: dashed
          linewidth: 2.5
          alpha: 1
      x_lim_start: '2022-12-01'
      x_lim_end: '2025-04-01'
      lower_y_lim: 0
      upper_y_lim: 1
      exclude:
      - SWAA
      include_agents:
      - Claude 3.5 Sonnet (New)
      - Claude 3.5 Sonnet (Old)
      - Claude 3 Opus
      - o1
      - o1-preview
      - GPT-4o
      - GPT-4 Turbo
      - GPT-4 1106
      - GPT-4 0314
    # Variant with an empty exclude list. It reads the double_line_all_data
    # fit file and lists eleven agents.
    # NOTE(review): include_agents omits GPT-4 1106, which does appear in the
    # sibling no_swaa figure and in
    # stages.plot_bar_chart_weighted_scores.focus_agents - confirm that the
    # omission is intentional.
    default:
      annotate_p50: true
      logistic_file: data/wrangled/logistic_fits/double_line_all_data.csv
      weighting: invsqrt_task_weight
      regularization: 0.1
      categories: ftr
      n_bins: 10
      n_subplot_cols: 3
      # One dashed horizontal line at p_success 0.5.
      horizontal_lines:
      - p_success: 0.5
        styling:
          color: '#b30c00'
          linestyle: dashed
          linewidth: 2.5
          alpha: 1
      x_lim_start: '2022-12-01'
      x_lim_end: '2025-04-01'
      lower_y_lim: 0
      upper_y_lim: 1
      exclude: []
      include_agents:
      - Claude 3.5 Sonnet (New)
      - Claude 3.5 Sonnet (Old)
      - Claude 3 Opus
      - o1
      - o1-preview
      - GPT-4o
      - GPT-4 Turbo
      - GPT-4 0314
      - gpt-3.5-turbo-instruct
      - davinci-002 (GPT-3)
      - GPT-2
  # Multiverse figure: lists two weightings and four regularization values,
  # presumably evaluated in combination - confirm with the plotting code.
  # Both weightings match weight_col values in the top-level weighting list.
  plot_logistic_multiverse:
    exclude_agents:
    - GPT-4 0125
    weightings:
    - equal_task_weight
    - invsqrt_task_weight
    regularizations:
    - 0.1
    - 0.05
    - 0.02
    - 0.01
    categories: ftr