Reproducing the paper's evaluation results #30

Open
JustQJ opened this issue Mar 7, 2024 · 1 comment

JustQJ commented Mar 7, 2024

How were the model evaluations in the paper carried out? I evaluated the DeepSeek-MoE-16b-chat model with the lm-eval library on the DROP and GSM8K datasets, but my results differ greatly from those reported in the paper. What might be the cause? The test commands and results are below; the tests were run on 2x 40G A100 GPUs. Thanks!
DROP test command:

lm_eval --model hf \
    --model_args pretrained=deepseek-ai/deepseek-moe-16b-chat,dtype="bfloat16",parallelize=True \
    --tasks drop \
    --batch_size 8 \
    --num_fewshot 1 \
    --output_path deepseek-moe-16b-chat-drop.json \
    --trust_remote_code

DROP test result:

{
  "results": {
    "drop": {
      "em,none": 0.011954697986577181,
      "em_stderr,none": 0.001113005689885913,
      "f1,none": 0.044237625838926285,
      "f1_stderr,none": 0.0014818249593304353,
      "alias": "drop"
    }
  },
  "group_subtasks": {
    "drop": []
  },
  "configs": {
    "drop": {
      "task": "drop",
      "dataset_path": "benchmark_datasets/drop",
      "dataset_kwargs": {
        "trust_remote_code": true
      },
      "training_split": "train",
      "validation_split": "validation",
      "process_docs": "def process_docs(dataset):\n    def _process(doc):\n        return {\n            \"id\": doc[\"query_id\"],\n            \"passage\": doc[\"passage\"],\n            \"question\": doc[\"question\"],\n            \"answers\": get_answers(doc),\n        }\n\n    return dataset.map(_process)\n",
      "doc_to_text": "{{passage}} {{question}}",
      "doc_to_target": "{{ answer|join(',')}}",
      "process_results": "def process_results(doc, results):\n    preds, golds = results, doc[\"answers\"]\n    max_em = 0\n    max_f1 = 0\n    for gold_answer in golds:\n        exact_match, f1_score = get_metrics(preds, gold_answer)\n        if gold_answer[0].strip():\n            max_em = max(max_em, exact_match)\n            max_f1 = max(max_f1, f1_score)\n    return {\"em\": max_em, \"f1\": max_f1}\n",
      "description": "",
      "target_delimiter": "",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 1,
      "metric_list": [
        {
          "metric": "em",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "f1",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "until": [
          "."
        ]
      },
      "repeats": 1,
      "should_decontaminate": true,
      "doc_to_decontamination_query": "{{passage}} {{question}}",
      "metadata": {
        "version": 3.0
      }
    }
  },
  "versions": {
    "drop": 3.0
  },
  "n-shot": {
    "drop": 1
  },
  "config": {
    "model": "hf",
    "model_args": "pretrained=deepseek-ai/deepseek-moe-16b-chat,dtype=bfloat16,parallelize=True,trust_remote_code=True",
    "batch_size": "8",
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null
  },
  "git_hash": "2dafddf",
  "transformers_version": "4.38.1",
  "upper_git_hash": null
}

GSM8K test command:

lm_eval --model hf \
    --model_args pretrained=deepseek-ai/deepseek-moe-16b-chat,dtype="bfloat16",parallelize=True \
    --tasks gsm8k \
    --batch_size 8 \
    --num_fewshot 0 \
    --output_path deepseek-moe-16b-chat-gsm8k.json \
    --trust_remote_code

GSM8K test result:

{
  "results": {
    "gsm8k": {
      "exact_match,strict-match": 0.0,
      "exact_match_stderr,strict-match": 0.0,
      "exact_match,flexible-extract": 0.33358605003790753,
      "exact_match_stderr,flexible-extract": 0.012987282131410809,
      "alias": "gsm8k"
    }
  },
  "group_subtasks": {
    "gsm8k": []
  },
  "configs": {
    "gsm8k": {
      "task": "gsm8k",
      "group": [
        "math_word_problems"
      ],
      "dataset_path": "benchmark_datasets/gsm8k",
      "dataset_name": "main",
      "training_split": "train",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Question: {{question}}\nAnswer:",
      "doc_to_target": "{{answer}}",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "exact_match",
          "aggregation": "mean",
          "higher_is_better": true,
          "ignore_case": true,
          "ignore_punctuation": false,
          "regexes_to_ignore": [
            ",",
            "\\$",
            "(?s).*#### ",
            "\\.$"
          ]
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "until": [
          "Question:",
          "</s>",
          "<|im_end|>"
        ],
        "do_sample": false,
        "temperature": 0.0
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "strict-match",
          "filter": [
            {
              "function": "regex",
              "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
            },
            {
              "function": "take_first"
            }
          ]
        },
        {
          "name": "flexible-extract",
          "filter": [
            {
              "function": "regex",
              "group_select": -1,
              "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 3.0
      }
    }
  },
  "versions": {
    "gsm8k": 3.0
  },
  "n-shot": {
    "gsm8k": 0
  },
  "config": {
    "model": "hf",
    "model_args": "pretrained=deepseek-ai/deepseek-moe-16b-chat,dtype=bfloat16,parallelize=True,trust_remote_code=True",
    "batch_size": "8",
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null
  },
  "git_hash": "2c017c1",
  "transformers_version": "4.38.1",
  "upper_git_hash": null
}
zwd003 (Collaborator) commented Apr 3, 2024

You can check the output logs to make sure the model's generations look normal.
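
For example, a recent lm-evaluation-harness release can re-run a small slice of the task and dump every prompt/response pair for manual inspection. This is only a sketch: it assumes the `--log_samples` and `--limit` flags are available in the harness version used above, and the output directory name is illustrative.

# Re-run 20 GSM8K examples and log each prompt and raw generation to disk,
# so truncated or off-format answers are easy to spot.
lm_eval --model hf \
    --model_args pretrained=deepseek-ai/deepseek-moe-16b-chat,dtype="bfloat16",parallelize=True \
    --tasks gsm8k \
    --num_fewshot 0 \
    --batch_size 8 \
    --limit 20 \
    --log_samples \
    --output_path deepseek-moe-16b-chat-gsm8k-samples \
    --trust_remote_code

If the logged generations look sensible but the scores stay low, the mismatch is more likely in prompt formatting or answer extraction than in the model itself (note that strict-match is 0.0 while flexible-extract is ~0.33 above, and the DROP config stops generation at the first ".").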
