@@ -914,27 +914,45 @@ def prompt_fn(line: dict, task_name: str = None):
914
914
# "claude-3-5-sonnet": "anthropic/claude-3-5-sonnet-20241022",
915
915
}
916
916
917
# Metric names grouped by category so that a run can enable whole families
# of metrics at once instead of listing every metric individually.
# NOTE(review): what each group means is inferred from the constant names
# only (string-overlap scoring / model-based scoring / remote-API scoring);
# confirm against the metric implementations.
LEXICAL_METRICS = [
    "bleu",
    "chrf",
    "bleu_sentence",
    "chrf_sentence",
    "ter_sentence",
    "meteor",
]
GPU_METRICS = [
    "bert_score",
    "bleurt_large",
    "xcomet_xxl",
]
API_METRICS = [
    "gemba_mqm_gpt_4o",
    "slt_judge_gpt_4o",
]
930
# One judge metric name for every combination of:
#   - an entry yielded by iterating JUDGE_MODELS,
#   - a system prompt style ("basic" / "detailed"),
#   - a few-shot style ("diverse" / "single").
# The parts are joined with "-" and every hyphen (including any inside the
# model identifier) is then replaced, so the final name uses underscores
# only and contains no whitespace.
JUDGE_METRICS = [
    f"slt_judge_{judge_model}-{system_style}-{few_shot_style}".replace("-", "_")
    for judge_model in JUDGE_MODELS
    for system_style in ["basic", "detailed"]
    for few_shot_style in ["diverse", "single"]
]
940
+
941
# Names of the metric groups to compute in this run. Recognised group names
# are "lexical", "gpu", "api" and "judge"; several may be listed together.
# The exact list ["debug"] is a special case that selects only "bleu".
metrics_to_evaluate = ["judge"]

# Group name -> metric names. Dict order fixes the order in which groups are
# appended, so METRICS_TO_USE is deterministic regardless of the order used
# in metrics_to_evaluate.
_metric_groups = {
    "lexical": LEXICAL_METRICS,
    "gpu": GPU_METRICS,
    "api": API_METRICS,
    "judge": JUDGE_METRICS,
}

if metrics_to_evaluate == ["debug"]:
    METRICS_TO_USE = ["bleu"]
else:
    _requested_groups = [
        group_name for group_name in _metric_groups if group_name in metrics_to_evaluate
    ]
    if _requested_groups:
        # Accumulate EVERY requested group. An if/elif chain would stop at the
        # first match and silently drop the remaining groups.
        METRICS_TO_USE = [
            metric
            for group_name in _requested_groups
            for metric in _metric_groups[group_name]
        ]
    else:
        # No recognised group requested: fall back to the default set, which
        # does not include the judge variants.
        METRICS_TO_USE = LEXICAL_METRICS + GPU_METRICS + API_METRICS
938
956
logger .info (f"Available metrics: { METRICS_TO_USE } " )
939
957
940
958
METRICS = {}
0 commit comments