diff --git a/lmms_eval/tasks/air_bench/utils.py b/lmms_eval/tasks/air_bench/utils.py
index 32ea79c57..448cc4436 100644
--- a/lmms_eval/tasks/air_bench/utils.py
+++ b/lmms_eval/tasks/air_bench/utils.py
@@ -132,7 +132,10 @@ def air_bench_process_results_chat(doc, result):
         eval_answer2, model_name2 = get_eval(max_tokens=1024, content=content)
 
     return {
-        "gpt_eval": {"eval_answer": [eval_answer, eval_answer2], "model_name": model_name},
+        "gpt_eval": {
+            "eval_answer": [eval_answer, eval_answer2],
+            "model_name": model_name,
+        },
     }
 
 
@@ -189,7 +192,10 @@ def air_bench_process_results_foundation(doc, result):
     score = 1.0 if pred == gt_ans else 0.0
     submission_dict = {}
     submission_dict = {doc.get("uniq_id", "unknown"): pred}
-    return {"accuracy": {"score": score, "task": doc["task_name"]}, "submission": submission_dict}
+    return {
+        "accuracy": {"score": score, "task": doc["task_name"]},
+        "submission": submission_dict,
+    }
 
 
 def air_bench_aggregate_results_for_submission(results, args):
@@ -211,7 +217,15 @@ def air_bench_aggregate_results_foundation(results):
         categorical_correct[result["task"]] += result["score"]
         categorical_total[result["task"]] += 1
 
-    return {"overall_accuracy": score / len(results), "categorical_accuracy": {task: categorical_correct[task] / categorical_total[task] for task in categorical_correct.keys()}}
+    overall_accuracy = round(score / len(results), 5)
+    categorical_accuracy = {task: round(categorical_correct[task] / categorical_total[task], 5) for task in categorical_correct.keys()}
+
+    eval_logger.info("=" * 50)
+    eval_logger.info(f"Overall accuracy: {overall_accuracy}")
+    for task, acc in categorical_accuracy.items():
+        eval_logger.info(f"{task} accuracy: {acc}")
+    eval_logger.info("=" * 50)
+    return overall_accuracy
 
 
 def parse_multi_choice_response(response, all_choices):
diff --git a/lmms_eval/tasks/vocalsound/utils.py b/lmms_eval/tasks/vocalsound/utils.py
index 33f893899..1a363040a 100644
--- a/lmms_eval/tasks/vocalsound/utils.py
+++ b/lmms_eval/tasks/vocalsound/utils.py
@@ -63,16 +63,21 @@ def vocalsound_aggregate_results(results):
             group_totals[age_group] += 1
             group_correct[age_group] += accuracy
 
-    return {
-        "overall_accuracy": total_correct / len(results),
-        "categorical_accuracy": {
-            "male_accuracy": round(group_correct["male"] / group_totals.get("male", 1), 5),  # Avoid division by zero
-            "female_accuracy": round(group_correct["female"] / group_totals.get("female", 1), 5),
-            "age_18_25_accuracy": round(group_correct["age1"] / group_totals.get("age1", 1), 5),
-            "age_26_48_accuracy": round(group_correct["age2"] / group_totals.get("age2", 1), 5),
-            "age_49_80_accuracy": round(group_correct["age3"] / group_totals.get("age3", 1), 5),
-        },
+    overall_accuracy = round(total_correct / len(results), 5)
+    categorical_accuracy = {
+        "male_accuracy": round(group_correct["male"] / group_totals.get("male", 1), 5),  # Avoid division by zero
+        "female_accuracy": round(group_correct["female"] / group_totals.get("female", 1), 5),
+        "age_18_25_accuracy": round(group_correct["age1"] / group_totals.get("age1", 1), 5),
+        "age_26_48_accuracy": round(group_correct["age2"] / group_totals.get("age2", 1), 5),
+        "age_49_80_accuracy": round(group_correct["age3"] / group_totals.get("age3", 1), 5),
     }
+    eval_logger.info("=" * 50)
+    eval_logger.info(f"Overall accuracy: {overall_accuracy}")
+    eval_logger.info("Categorical accuracy: ")
+    for key, value in categorical_accuracy.items():
+        eval_logger.info(f"{key} accuracy: {value}")
+    eval_logger.info("=" * 50)
+    return overall_accuracy
 
 
 def get_answer(response):