@@ -914,27 +914,45 @@ def prompt_fn(line: dict, task_name: str = None):
914914 # "claude-3-5-sonnet": "anthropic/claude-3-5-sonnet-20241022",
915915}
916916
# Metric names, grouped so that a run can enable each category separately.
# NOTE(review): the group names suggest the resource each category needs
# (CPU-only, GPU, external API) -- confirm against the metric implementations.

# String-overlap metrics, corpus-level and sentence-level variants.
LEXICAL_METRICS = [
    "bleu", "chrf",
    "bleu_sentence", "chrf_sentence", "ter_sentence",
    "meteor",
]

# Learned / model-based metrics.
GPU_METRICS = ["bert_score", "bleurt_large", "xcomet_xxl"]

# Metrics whose names reference the gpt_4o model.
API_METRICS = ["gemba_mqm_gpt_4o", "slt_judge_gpt_4o"]

# One "slt_judge_*" metric per (judge model, system prompt style, few-shot
# style) combination; dashes in the assembled name are normalised to
# underscores.
JUDGE_METRICS = [
    f"slt_judge_{judge_model}-{system_style}-{few_shot_style}".replace("-", "_")
    for judge_model in JUDGE_MODELS
    for system_style in ["basic", "detailed"]
    for few_shot_style in ["diverse", "single"]
]
940+
# Metric groups to enable for this run. Recognised group names are "lexical",
# "gpu", "api" and "judge"; several may be listed together. The exact value
# ["debug"] restricts the run to "bleu" only.
metrics_to_evaluate = ["judge"]

# Group name -> metric names. Dict order fixes the order in which groups are
# appended to METRICS_TO_USE, regardless of the order they were requested in.
_METRIC_GROUPS = {
    "lexical": LEXICAL_METRICS,
    "gpu": GPU_METRICS,
    "api": API_METRICS,
    "judge": JUDGE_METRICS,
}
_requested_groups = [
    group for group in _METRIC_GROUPS if group in metrics_to_evaluate
]

if metrics_to_evaluate == ["debug"]:
    METRICS_TO_USE = ["bleu"]
elif _requested_groups:
    # Include EVERY requested group. The previous if/elif chain stopped at the
    # first match, so e.g. ["lexical", "judge"] silently dropped the judge
    # metrics.
    METRICS_TO_USE = [
        metric
        for group in _requested_groups
        for metric in _METRIC_GROUPS[group]
    ]
else:
    # No recognised group requested: fall back to every non-judge metric.
    METRICS_TO_USE = LEXICAL_METRICS + GPU_METRICS + API_METRICS
logger.info(f"Available metrics: {METRICS_TO_USE}")
939957
940958METRICS = {}
0 commit comments