Logging Revamp #284

Merged 20 commits on Sep 4, 2024
2 changes: 2 additions & 0 deletions .github/workflows/tests.yaml
@@ -34,6 +34,8 @@ jobs:
path: "cache"
key: test-cache-HF
- name: Test
env:
HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }}
run: | # PYTHONPATH="${PYTHONPATH}:src" HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models"
python -m pytest --disable-pytest-warnings
- name: Write cache
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -74,6 +74,7 @@ dependencies = [
"sentencepiece>=0.1.99",
"protobuf==3.20.*", # pinned for sentencepiece compat
"pycountry",
"fsspec>=2023.12.2",
]

[project.optional-dependencies]
@@ -95,6 +96,7 @@ extended_tasks = [
"langdetect", # ifeval
"openai", # llm as a judge using openai models
]
s3 = ["s3fs"]

[project.urls]
Homepage = "https://github.com/huggingface/lighteval"
5 changes: 2 additions & 3 deletions src/lighteval/config/lighteval_config.py
@@ -58,9 +58,8 @@ class LightEvalLoggingArgs:

output_dir: str
save_details: bool = True
push_results_to_hub: bool = False
push_details_to_hub: bool = False
push_results_to_tensorboard: bool = False
push_to_hub: bool = False
push_to_tensorboard: bool = False
public_run: bool = False
results_org: str | None = None
tensorboard_metric_prefix: str = "eval"
213 changes: 97 additions & 116 deletions src/lighteval/logging/evaluation_tracker.py

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions src/lighteval/main_accelerate.py
@@ -48,12 +48,11 @@ def main(args):
env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
evaluation_tracker = EvaluationTracker(
output_dir=args.output_dir,
hub_results_org=args.results_org,
Member

Why did you remove this one? There is a check at init that fails if this is not set while push_to_hub is set.

Collaborator Author
@hynky1999 Sep 2, 2024

I didn't remove it; I changed the ordering for some reason (during dev). We can move it up for sure.

Member

Hmm, I added it back a minute ago, I couldn't see it before.

Collaborator Author

Ah, you are actually right; I was looking at the most recent diff and thought I had changed that. I should have written the tests properly so that this would be caught (https://github.com/huggingface/lighteval/pull/284/files#diff-97ad1010a3f9d0568387464594c6396718637ed66d355d6b3280aa1f72607ba1R121). Will change it!

push_results_to_hub=args.push_results_to_hub,
push_details_to_hub=args.push_details_to_hub,
push_results_to_tensorboard=args.push_results_to_tensorboard,
save_details=args.save_details,
push_to_hub=args.push_to_hub,
push_to_tensorboard=args.push_to_tensorboard,
public=args.public_run,
token=TOKEN,
hub_results_org=args.results_org,
)
pipeline_params = PipelineParameters(
launcher_type=ParallelismManager.ACCELERATE,
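The rewritten `evaluation_tracker.py` is not rendered above, but the call sites in `main_accelerate.py` and `main_nanotron.py` show the consolidated constructor, and the review thread above mentions an init-time check that fails when `push_to_hub` is set without `hub_results_org`. Below is a minimal sketch of that surface, assuming parameter names and defaults taken from the call sites; the check body is an assumption, not the PR's actual code:

```python
# Hedged sketch of the consolidated EvaluationTracker constructor, inferred
# from the call sites in this PR. Not the actual implementation in
# src/lighteval/logging/evaluation_tracker.py.
class EvaluationTracker:
    def __init__(
        self,
        output_dir: str,
        save_details: bool = True,
        push_to_hub: bool = False,
        push_to_tensorboard: bool = False,
        public: bool = False,
        token: str = "",
        hub_results_org: str = "",
    ) -> None:
        if push_to_hub and not hub_results_org:
            # The init check mentioned in the review thread (assumed wording):
            # pushing to the Hub needs a target organization.
            raise ValueError("hub_results_org must be set when push_to_hub is True.")
        self.output_dir = output_dir
        self.save_details = save_details
        self.push_to_hub = push_to_hub
        self.push_to_tensorboard = push_to_tensorboard
        self.public = public
        self.token = token
        self.hub_results_org = hub_results_org
```

The design choice visible throughout the diff is the same: the three flags push_results_to_hub, push_details_to_hub, and push_results_to_tensorboard collapse into push_to_hub and push_to_tensorboard.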
4 changes: 4 additions & 0 deletions src/lighteval/main_nanotron.py
@@ -69,6 +69,10 @@ def main(
evaluation_tracker = EvaluationTracker(
output_dir=lighteval_config.logging.output_dir,
hub_results_org=lighteval_config.logging.results_org,
public=lighteval_config.logging.public_run,
push_to_hub=lighteval_config.logging.push_to_hub,
push_to_tensorboard=lighteval_config.logging.push_to_tensorboard,
save_details=lighteval_config.logging.save_details,
tensorboard_metric_prefix=lighteval_config.logging.tensorboard_metric_prefix,
nanotron_run_info=nanotron_config.nanotron_config.general,
)
12 changes: 6 additions & 6 deletions src/lighteval/parsers.py
@@ -54,15 +54,15 @@ def parser_accelerate(parser=None):
parser.add_argument("--job_id", type=str, help="Optional Job ID for future reference", default="")

# Saving
parser.add_argument("--output_dir", required=True, type=str, help="Directory to save the results")
parser.add_argument(
"--push_results_to_hub", default=False, action="store_true", help="Set to push the results to the hub"
"--output_dir",
required=True,
type=str,
help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)",
)
parser.add_argument("--save_details", action="store_true", help="Save the details of the run in the output_dir")
parser.add_argument(
"--push_details_to_hub", default=False, action="store_true", help="Set to push the details to the hub"
)
parser.add_argument("--push_results_to_tensorboard", default=False, action="store_true")
parser.add_argument("--push_to_hub", default=False, action="store_true", help="Set to push the details to the hub")
parser.add_argument("--push_to_tensorboard", default=False, action="store_true")
parser.add_argument(
"--public_run", default=False, action="store_true", help="Push results and details to a public repo"
)
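The updated `--output_dir` help describes the path as fsspec-compliant (e.g. `s3://bucket/path`), which is why `fsspec` becomes a core dependency and `s3fs` an optional `s3` extra in `pyproject.toml`. Below is a minimal sketch of what writing results through such a path can look like, assuming the hypothetical helper `save_results` and an installed `s3fs` for `s3://` URLs; this is an illustration, not the PR's implementation:

```python
# Illustration only: resolve an fsspec-compliant output_dir and write a
# results file to it. Works for local paths and, with s3fs installed,
# for s3://bucket/path URLs. The helper name save_results is hypothetical.
import json

from fsspec.core import url_to_fs


def save_results(output_dir: str, file_name: str, results: dict) -> None:
    # url_to_fs picks the backend from the URL protocol: "s3://..." resolves
    # to s3fs, a plain path resolves to the local filesystem.
    fs, base_path = url_to_fs(output_dir)
    fs.makedirs(f"{base_path}/results", exist_ok=True)
    with fs.open(f"{base_path}/results/{file_name}", "w") as f:
        json.dump(results, f)


# The same call works for a local directory and for an S3 bucket:
# save_results("results", "results_2024-09-04.json", {"task1": {"acc": 0.8}})
# save_results("s3://my-bucket/lighteval", "results_2024-09-04.json", {"task1": {"acc": 0.8}})
```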
60 changes: 60 additions & 0 deletions tests/fixtures.py
@@ -0,0 +1,60 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import os

import pytest
from huggingface_hub import HfApi
from huggingface_hub.hf_api import DatasetInfo


TESTING_EMPTY_HF_ORG_ID = "lighteval-tests"


@pytest.fixture
def testing_empty_hf_org_id(org_id: str = TESTING_EMPTY_HF_ORG_ID):
old_token = os.getenv("HF_TOKEN")
os.environ["HF_TOKEN"] = os.getenv("HF_TEST_TOKEN")

def list_repos(org_id: str):
return list(hf_api.list_models(author=org_id)) + list(hf_api.list_datasets(author=org_id))

def clean_repos(org_id: str):
repos = list_repos(org_id)
for repo in repos:
hf_api.delete_repo(repo.id, repo_type="dataset" if isinstance(repo, DatasetInfo) else "model")

hf_api = HfApi()
# Remove all repositories in the HF org
clean_repos(org_id)

# Verify that all repositories have been removed
remaining_repos = list_repos(org_id)
assert len(remaining_repos) == 0, f"Expected 0 repositories, but found {len(remaining_repos)}"

yield org_id

# Clean up: recreate any necessary default repositories after the test
# This step is optional and depends on your specific needs
clean_repos(org_id)
os.environ["HF_TOKEN"] = old_token
165 changes: 165 additions & 0 deletions tests/logging/test_evaluation_tracker.py
@@ -0,0 +1,165 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import json
import os
import tempfile
from datetime import datetime
from pathlib import Path

import pytest
from datasets import Dataset
from huggingface_hub import HfApi

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.info_loggers import DetailsLogger

# ruff: noqa
from tests.fixtures import TESTING_EMPTY_HF_ORG_ID, testing_empty_hf_org_id


@pytest.fixture
def mock_evaluation_tracker(request):
passed_params = {}
if request.keywords.get("evaluation_tracker"):
passed_params = request.keywords["evaluation_tracker"].kwargs

with tempfile.TemporaryDirectory() as temp_dir:
kwargs = {
"output_dir": temp_dir,
"save_details": passed_params.get("save_details", False),
"push_to_hub": passed_params.get("push_to_hub", False),
"push_to_tensorboard": passed_params.get("push_to_tensorboard", False),
"hub_results_org": passed_params.get("hub_results_org", ""),
}
tracker = EvaluationTracker(**kwargs)
tracker.general_config_logger.model_name = "test_model"
yield tracker


@pytest.fixture
def mock_datetime(monkeypatch):
mock_date = datetime(2023, 1, 1, 12, 0, 0)

class MockDatetime:
@classmethod
def now(cls):
return mock_date

@classmethod
def fromisoformat(cls, date_string: str):
return mock_date

monkeypatch.setattr("lighteval.logging.evaluation_tracker.datetime", MockDatetime)
return mock_date


def test_results_logging(mock_evaluation_tracker: EvaluationTracker):
task_metrics = {
"task1": {"accuracy": 0.8, "f1": 0.75},
"task2": {"precision": 0.9, "recall": 0.85},
}
mock_evaluation_tracker.metrics_logger.metric_aggregated = task_metrics

mock_evaluation_tracker.save()

results_dir = Path(mock_evaluation_tracker.output_dir) / "results" / "test_model"
assert results_dir.exists()

result_files = list(results_dir.glob("results_*.json"))
assert len(result_files) == 1

with open(result_files[0], "r") as f:
saved_results = json.load(f)

assert "results" in saved_results
assert saved_results["results"] == task_metrics
assert saved_results["config_general"]["model_name"] == "test_model"


@pytest.mark.evaluation_tracker(save_details=True)
def test_details_logging(mock_evaluation_tracker, mock_datetime):
task_details = {
"task1": [DetailsLogger.CompiledDetail(truncated=10, padded=5)],
"task2": [DetailsLogger.CompiledDetail(truncated=20, padded=10)],
}
mock_evaluation_tracker.details_logger.details = task_details

mock_evaluation_tracker.save()

date_id = mock_datetime.isoformat().replace(":", "-")
details_dir = Path(mock_evaluation_tracker.output_dir) / "details" / "test_model" / date_id
assert details_dir.exists()

for task in ["task1", "task2"]:
file_path = details_dir / f"details_{task}_{date_id}.parquet"
dataset = Dataset.from_parquet(str(file_path))
assert len(dataset) == 1
assert int(dataset[0]["truncated"]) == task_details[task][0].truncated
assert int(dataset[0]["padded"]) == task_details[task][0].padded


@pytest.mark.evaluation_tracker(save_details=False)
def test_no_details_output(mock_evaluation_tracker: EvaluationTracker):
mock_evaluation_tracker.save()

details_dir = Path(mock_evaluation_tracker.output_dir) / "details" / "test_model"
assert not details_dir.exists()


@pytest.mark.evaluation_tracker(push_to_hub=True, hub_results_org=TESTING_EMPTY_HF_ORG_ID)
def test_push_to_hub_works(testing_empty_hf_org_id, mock_evaluation_tracker: EvaluationTracker, mock_datetime):
# Prepare the dummy data
task_metrics = {
"task1": {"accuracy": 0.8, "f1": 0.75},
"task2": {"precision": 0.9, "recall": 0.85},
}
mock_evaluation_tracker.metrics_logger.metric_aggregated = task_metrics

task_details = {
"task1": [DetailsLogger.CompiledDetail(truncated=10, padded=5)],
"task2": [DetailsLogger.CompiledDetail(truncated=20, padded=10)],
}
mock_evaluation_tracker.details_logger.details = task_details
mock_evaluation_tracker.save()

# Verify using HfApi
api = HfApi()

# Check if repo exists and it's private
expected_repo_id = f"{testing_empty_hf_org_id}/details_test_model_private"
assert api.repo_exists(repo_id=expected_repo_id, repo_type="dataset")
assert api.repo_info(repo_id=expected_repo_id, repo_type="dataset").private

repo_files = api.list_repo_files(repo_id=expected_repo_id, repo_type="dataset")
# Check if README.md exists
assert any(file == "README.md" for file in repo_files)

# Check that both results files were uploaded
result_files = [file for file in repo_files if file.startswith("results_")]
assert len(result_files) == 2
assert len([file for file in result_files if file.endswith(".json")]) == 1
assert len([file for file in result_files if file.endswith(".parquet")]) == 1

# Check that the details dataset was uploaded
details_files = [file for file in repo_files if "details_" in file and file.endswith(".parquet")]
assert len(details_files) == 2