Reproducing the paper's evaluation results #30

Open
JustQJ opened this issue Mar 7, 2024 · 1 comment

JustQJ commented Mar 7, 2024

How were the model evaluations in the paper carried out? I evaluated the DeepSeek-MoE-16b-chat model with the lm-eval library on the DROP and GSM8K datasets, but my results differ greatly from those reported in the paper. What might be the cause? The test commands and results are below; the tests were run on 2x 40G A100 GPUs. Thanks!
DROP test command:

lm_eval --model hf \
    --model_args pretrained=deepseek-ai/deepseek-moe-16b-chat,dtype="bfloat16",parallelize=True \
    --tasks drop \
    --batch_size 8 \
    --num_fewshot 1 \
    --output_path deepseek-moe-16b-chat-drop.json \
    --trust_remote_code

DROP test result:

{
  "results": {
    "drop": {
      "em,none": 0.011954697986577181,
      "em_stderr,none": 0.001113005689885913,
      "f1,none": 0.044237625838926285,
      "f1_stderr,none": 0.0014818249593304353,
      "alias": "drop"
    }
  },
  "group_subtasks": {
    "drop": []
  },
  "configs": {
    "drop": {
      "task": "drop",
      "dataset_path": "benchmark_datasets/drop",
      "dataset_kwargs": {
        "trust_remote_code": true
      },
      "training_split": "train",
      "validation_split": "validation",
      "process_docs": "def process_docs(dataset):\n    def _process(doc):\n        return {\n            \"id\": doc[\"query_id\"],\n            \"passage\": doc[\"passage\"],\n            \"question\": doc[\"question\"],\n            \"answers\": get_answers(doc),\n        }\n\n    return dataset.map(_process)\n",
      "doc_to_text": "{{passage}} {{question}}",
      "doc_to_target": "{{ answer|join(',')}}",
      "process_results": "def process_results(doc, results):\n    preds, golds = results, doc[\"answers\"]\n    max_em = 0\n    max_f1 = 0\n    for gold_answer in golds:\n        exact_match, f1_score = get_metrics(preds, gold_answer)\n        if gold_answer[0].strip():\n            max_em = max(max_em, exact_match)\n            max_f1 = max(max_f1, f1_score)\n    return {\"em\": max_em, \"f1\": max_f1}\n",
      "description": "",
      "target_delimiter": "",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 1,
      "metric_list": [
        {
          "metric": "em",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "f1",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "until": [
          "."
        ]
      },
      "repeats": 1,
      "should_decontaminate": true,
      "doc_to_decontamination_query": "{{passage}} {{question}}",
      "metadata": {
        "version": 3.0
      }
    }
  },
  "versions": {
    "drop": 3.0
  },
  "n-shot": {
    "drop": 1
  },
  "config": {
    "model": "hf",
    "model_args": "pretrained=deepseek-ai/deepseek-moe-16b-chat,dtype=bfloat16,parallelize=True,trust_remote_code=True",
    "batch_size": "8",
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null
  },
  "git_hash": "2dafddf",
  "transformers_version": "4.38.1",
  "upper_git_hash": null
}

GSM8K test command:

lm_eval --model hf \
    --model_args pretrained=deepseek-ai/deepseek-moe-16b-chat,dtype="bfloat16",parallelize=True \
    --tasks gsm8k \
    --batch_size 8 \
    --num_fewshot 0 \
    --output_path deepseek-moe-16b-chat-gsm8k.json \
    --trust_remote_code

GSM8K test result:

{
  "results": {
    "gsm8k": {
      "exact_match,strict-match": 0.0,
      "exact_match_stderr,strict-match": 0.0,
      "exact_match,flexible-extract": 0.33358605003790753,
      "exact_match_stderr,flexible-extract": 0.012987282131410809,
      "alias": "gsm8k"
    }
  },
  "group_subtasks": {
    "gsm8k": []
  },
  "configs": {
    "gsm8k": {
      "task": "gsm8k",
      "group": [
        "math_word_problems"
      ],
      "dataset_path": "benchmark_datasets/gsm8k",
      "dataset_name": "main",
      "training_split": "train",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Question: {{question}}\nAnswer:",
      "doc_to_target": "{{answer}}",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "exact_match",
          "aggregation": "mean",
          "higher_is_better": true,
          "ignore_case": true,
          "ignore_punctuation": false,
          "regexes_to_ignore": [
            ",",
            "\\$",
            "(?s).*#### ",
            "\\.$"
          ]
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "until": [
          "Question:",
          "</s>",
          "<|im_end|>"
        ],
        "do_sample": false,
        "temperature": 0.0
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "strict-match",
          "filter": [
            {
              "function": "regex",
              "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
            },
            {
              "function": "take_first"
            }
          ]
        },
        {
          "name": "flexible-extract",
          "filter": [
            {
              "function": "regex",
              "group_select": -1,
              "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 3.0
      }
    }
  },
  "versions": {
    "gsm8k": 3.0
  },
  "n-shot": {
    "gsm8k": 0
  },
  "config": {
    "model": "hf",
    "model_args": "pretrained=deepseek-ai/deepseek-moe-16b-chat,dtype=bfloat16,parallelize=True,trust_remote_code=True",
    "batch_size": "8",
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null
  },
  "git_hash": "2c017c1",
  "transformers_version": "4.38.1",
  "upper_git_hash": null
}
zwd003 (Collaborator) commented Apr 3, 2024

You can check the output logs to make sure the model's generations look normal.
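
For example, a recent lm-evaluation-harness release can re-run a small slice of the task and dump every prompt/response pair for manual inspection. This is only a sketch: it assumes the `--log_samples` and `--limit` flags are available in the harness version used above, and the output directory name is illustrative.

# Re-run 20 GSM8K examples and log each prompt and raw generation to disk,
# so truncated or off-format answers are easy to spot.
lm_eval --model hf \
    --model_args pretrained=deepseek-ai/deepseek-moe-16b-chat,dtype="bfloat16",parallelize=True \
    --tasks gsm8k \
    --num_fewshot 0 \
    --batch_size 8 \
    --limit 20 \
    --log_samples \
    --output_path deepseek-moe-16b-chat-gsm8k-samples \
    --trust_remote_code

If the logged generations look sensible but the scores stay low, the mismatch is more likely in prompt formatting or answer extraction than in the model itself (note that strict-match is 0.0 while flexible-extract is ~0.33 above, and the DROP config stops generation at the first ".").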
