
Commit 06ae3f6

Format code with ruff (#5519)
* Update config files
* Format code
* Some manual fixes
* Fix
1 parent 819bc6e commit 06ae3f6


51 files changed (+52, -105 lines)

.github/workflows/ci.yml (+1, -2)

@@ -28,8 +28,7 @@ jobs:
       - name: Check quality
         run: |
           black --check tests src benchmarks metrics
-          isort --check-only tests src benchmarks metrics
-          flake8 tests src benchmarks metrics
+          ruff tests src benchmarks metrics

   test:
     needs: check_code_quality

.gitignore (+4, -1)

@@ -61,4 +61,7 @@ docs/source/_build/

 # Benchmark results
 report.json
-report.md
+report.md
+
+# Ruff
+.ruff_cache

CONTRIBUTING.md (+1, -1)

@@ -57,7 +57,7 @@ If you want to add a dataset see specific instructions in the section [*How to a

 5. Develop the features on your branch.

-6. Format your code. Run black and isort so that your newly added files look nice with the following command:
+6. Format your code. Run black and ruff so that your newly added files look nice with the following command:

 ```bash
 make style

Makefile (+5, -4)

@@ -1,17 +1,18 @@
 .PHONY: quality style test

+check_dirs := tests src benchmarks metrics
+
 # Check that source code meets quality standards

 quality:
-    black --check tests src benchmarks metrics
-    isort --check-only tests src benchmarks metrics
-    flake8 tests src benchmarks metrics
+    black --check $(check_dirs)
+    ruff $(check_dirs)

 # Format source code automatically

 style:
     black tests src benchmarks metrics
-    isort tests src benchmarks metrics
+    ruff $(check_dirs) --fix

 # Run tests for the library


benchmarks/format.py (-1)

@@ -9,7 +9,6 @@ def format_json_to_md(input_json_file, output_md_file):
     output_md = ["<details>", "<summary>Show updated benchmarks!</summary>", " "]

     for benchmark_name in sorted(results):
-
         benchmark_res = results[benchmark_name]

         benchmark_file_name = benchmark_name.split("/")[-1]
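
Most of the per-file edits that follow share one mechanical pattern, visible in the hunk above: a stray blank line sitting directly under a `def` (or `with`/`for`) header is deleted. This most likely comes from the black 22 → 23 bump this commit also makes in `setup.py` (black's 2023 stable style removes empty lines at the start of a block), rather than from ruff itself. A minimal before/after sketch with a made-up function:

```python
# Before: a blank line separates the signature from the body.
def mean(values):

    return sum(values) / len(values)


# After formatting: the body starts on the first line under the signature.
def mean(values):
    return sum(values) / len(values)
```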

metrics/bleurt/bleurt.py (-2)

@@ -78,7 +78,6 @@
 @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class BLEURT(datasets.Metric):
     def _info(self):
-
         return datasets.MetricInfo(
             description=_DESCRIPTION,
             citation=_CITATION,
@@ -95,7 +94,6 @@ def _info(self):
         )

     def _download_and_prepare(self, dl_manager):
-
         # check that config name specifies a valid BLEURT model
         if self.config_name == "default":
             logger.warning(

metrics/code_eval/execute.py (-2)

@@ -54,9 +54,7 @@ def check_correctness(check_program, timeout, task_id, completion_id):


 def unsafe_execute(check_program, result, timeout):
-
     with create_tempdir():
-
         # These system calls are needed when cleaning up tempdir.
         import os
         import shutil

metrics/comet/comet.py (-1)

@@ -108,7 +108,6 @@
 @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class COMET(datasets.Metric):
     def _info(self):
-
         return datasets.MetricInfo(
             description=_DESCRIPTION,
             citation=_CITATION,

metrics/coval/coval.py (+1, -2)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ CoVal metric. """
-import coval  # From: git+https://github.com/ns-moosavi/coval.git noqa: F401
+import coval  # From: git+https://github.com/ns-moosavi/coval.git # noqa: F401
 from coval.conll import reader, util
 from coval.eval import evaluator

@@ -167,7 +167,6 @@
 def get_coref_infos(
     key_lines, sys_lines, NP_only=False, remove_nested=False, keep_singletons=True, min_span=False, doc="dummy_doc"
 ):
-
     key_doc_lines = {doc: key_lines}
     sys_doc_lines = {doc: sys_lines}

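
The tiny change in the first hunk above matters: linters only honor a suppression written as its own `# noqa` comment marker, so `...coval.git noqa: F401` buried in a prose comment was never recognized, while `... # noqa: F401` is. These inline directives are needed now because the blanket `per-file-ignores = metrics/*:F401` exception is being dropped from `setup.cfg` (see below). A runnable sketch using a stdlib module as a stand-in:

```python
# Not honored: "noqa" sits inside a prose comment with no "#" before it.
import json  # re-exported for callers noqa: F401

# Honored: the directive is its own comment marker after the prose comment.
import json  # re-exported for callers  # noqa: F401
```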

metrics/exact_match/exact_match.py (-1)

@@ -108,7 +108,6 @@ def _compute(
         ignore_punctuation=False,
         ignore_numbers=False,
     ):
-
         if regexes_to_ignore is not None:
             for s in regexes_to_ignore:
                 predictions = np.array([re.sub(s, "", x) for x in predictions])

metrics/indic_glue/indic_glue.py (-1)

@@ -15,7 +15,6 @@

 import numpy as np
 from scipy.spatial.distance import cdist
-from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import f1_score

 import datasets

metrics/mae/mae.py (-1)

@@ -106,7 +106,6 @@ def _get_feature_types(self):
         }

     def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average"):
-
         mae_score = mean_absolute_error(references, predictions, sample_weight=sample_weight, multioutput=multioutput)

         return {"mae": mae_score}

metrics/mahalanobis/mahalanobis.py (-1)

@@ -71,7 +71,6 @@ def _info(self):
         )

     def _compute(self, X, reference_distribution):
-
         # convert to numpy arrays
         X = np.array(X)
         reference_distribution = np.array(reference_distribution)

metrics/mauve/mauve.py (+5, -5)

@@ -14,11 +14,11 @@
 # limitations under the License.
 """ MAUVE metric from https://github.com/krishnap25/mauve. """

-import faiss  # Here to have a nice missing dependency error message early on
-import numpy  # Here to have a nice missing dependency error message early on
-import requests  # Here to have a nice missing dependency error message early on
-import sklearn  # Here to have a nice missing dependency error message early on
-import tqdm  # Here to have a nice missing dependency error message early on
+import faiss  # noqa: F401  # Here to have a nice missing dependency error message early on
+import numpy  # noqa: F401  # Here to have a nice missing dependency error message early on
+import requests  # noqa: F401  # Here to have a nice missing dependency error message early on
+import sklearn  # noqa: F401  # Here to have a nice missing dependency error message early on
+import tqdm  # noqa: F401  # Here to have a nice missing dependency error message early on
 from mauve import compute_mauve  # From: mauve-text

 import datasets
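
For context, these imports exist only as an early dependency check: importing each optional package at module load makes a missing one fail immediately with a clear `ModuleNotFoundError`, instead of deep inside a compute call, and `# noqa: F401` keeps ruff's unused-import rule from flagging (or auto-removing) them. A sketch of the pattern, with `json` standing in for a heavy optional dependency:

```python
# Imported solely so a missing dependency fails fast and loudly at load
# time; the noqa keeps F401 (unused import) from flagging or removing it.
import json  # noqa: F401  # Here to have a nice missing dependency error message early on


def compute(texts):
    # The real metric would call into the optional dependency here.
    return len(texts)
```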

metrics/mse/mse.py (-1)

@@ -110,7 +110,6 @@ def _get_feature_types(self):
         }

     def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average", squared=True):
-
         mse = mean_squared_error(
             references, predictions, sample_weight=sample_weight, multioutput=multioutput, squared=squared
         )

metrics/perplexity/perplexity.py (-1)

@@ -101,7 +101,6 @@ def _info(self):
         )

     def _compute(self, input_texts, model_id, batch_size: int = 16, add_start_token: bool = True, device=None):
-
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
             if device == "gpu":

metrics/rouge/rouge.py (+4, -4)

@@ -14,10 +14,10 @@
 """ ROUGE metric from Google Research github repo. """

 # The dependencies in https://github.com/google-research/google-research/blob/master/rouge/requirements.txt
-import absl  # Here to have a nice missing dependency error message early on
-import nltk  # Here to have a nice missing dependency error message early on
-import numpy  # Here to have a nice missing dependency error message early on
-import six  # Here to have a nice missing dependency error message early on
+import absl  # noqa: F401  # Here to have a nice missing dependency error message early on
+import nltk  # noqa: F401  # Here to have a nice missing dependency error message early on
+import numpy  # noqa: F401  # Here to have a nice missing dependency error message early on
+import six  # noqa: F401  # Here to have a nice missing dependency error message early on
 from rouge_score import rouge_scorer, scoring

 import datasets

metrics/sari/sari.py (-2)

@@ -227,7 +227,6 @@ def SARIsent(ssent, csent, rsents):


 def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_str: bool = True):
-
     # Normalization is requried for the ASSET dataset (one of the primary
     # datasets in sentence simplification) to allow using space
     # to split the sentence. Even though Wiki-Auto and TURK datasets,
@@ -278,7 +277,6 @@ def _info(self):
         )

     def _compute(self, sources, predictions, references):
-
         if not (len(sources) == len(predictions) == len(references)):
             raise ValueError("Sources length must match predictions and references lengths.")
         sari_score = 0

metrics/super_glue/super_glue.py (+1, -1)

@@ -135,7 +135,7 @@ def evaluate_multirc(ids_preds, labels):
         question_preds, question_labels = zip(*preds_labels)
         f1 = f1_score(y_true=question_labels, y_pred=question_preds, average="macro")
         f1s.append(f1)
-        em = int(sum(p == l for p, l in preds_labels) == len(preds_labels))
+        em = int(sum(pred == label for pred, label in preds_labels) == len(preds_labels))
         ems.append(em)
     f1_m = float(sum(f1s) / len(f1s))
     em = sum(ems) / len(ems)
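
The rename above is presumably driven by pycodestyle's E741 (ambiguous variable name), enabled through the selected "E" rules in `pyproject.toml` below; it flags single-letter names like `l` that are easy to confuse with `1`, and the spelled-out names read better anyway. A self-contained illustration with made-up data:

```python
# Three (prediction, label) pairs for one question; E741 would flag
# unpacking them as `p, l`, so spelled-out names are used instead.
preds_labels = [(1, 1), (0, 0), (1, 0)]

em = int(sum(pred == label for pred, label in preds_labels) == len(preds_labels))
print(em)  # 0 -- exact match requires every prediction to equal its label
```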

metrics/wiki_split/wiki_split.py (-2)

@@ -254,7 +254,6 @@ def SARIsent(ssent, csent, rsents):


 def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_str: bool = True):
-
     # Normalization is requried for the ASSET dataset (one of the primary
     # datasets in sentence simplification) to allow using space
     # to split the sentence. Even though Wiki-Auto and TURK datasets,
@@ -284,7 +283,6 @@ def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_s


 def compute_sari(sources, predictions, references):
-
     if not (len(sources) == len(predictions) == len(references)):
         raise ValueError("Sources length must match predictions and references lengths.")
     sari_score = 0

metrics/xtreme_s/xtreme_s.py (-1)

@@ -238,7 +238,6 @@ def _info(self):
         )

     def _compute(self, predictions, references, bleu_kwargs=None, wer_kwargs=None):
-
         bleu_kwargs = bleu_kwargs if bleu_kwargs is not None else {}
         wer_kwargs = wer_kwargs if wer_kwargs is not None else {}


pyproject.toml (+12)

@@ -1,3 +1,15 @@
 [tool.black]
 line-length = 119
 target_version = ['py37']
+
+[tool.ruff]
+# Ignored rules:
+# "E501" -> line length violation
+# "F821" -> undefined named in type annotation (e.g. Literal["something"])
+ignore = ["E501", "F821"]
+select = ["E", "F", "I", "W"]
+line-length = 119
+
+[tool.ruff.isort]
+lines-after-imports = 2
+known-first-party = ["datasets"]
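
The `[tool.ruff.isort]` table drives the import-sorting ("I") rules: `known-first-party = ["datasets"]` puts `datasets` in its own first-party block after the standard-library and third-party blocks (matching the import layout in the metric files above), and `lines-after-imports = 2` requires two blank lines between the last import and the first definition. A small hypothetical module laid out the way this configuration expects:

```python
import os

import numpy as np

import datasets


def load_train(data_dir: str):
    # stdlib block, third-party block, first-party block, two blank lines.
    name = os.path.basename(data_dir)
    ds = datasets.load_dataset(name, split="train")
    return np.asarray(ds["label"])
```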

setup.cfg (-18)

@@ -1,24 +1,6 @@
 [metadata]
 license_file = LICENSE

-[isort]
-ensure_newline_before_comments = True
-force_grid_wrap = 0
-include_trailing_comma = True
-line_length = 119
-lines_after_imports = 2
-multi_line_output = 3
-use_parentheses = True
-
-[flake8]
-ignore = E203, E501, W503
-max-line-length = 119
-exclude =
-    src/datasets/datasets
-    src/datasets/metrics
-per-file-ignores =
-    metrics/*:F401
-
 [tool:pytest]
 markers =
     unit: unit test

setup.py (+1, -1)

@@ -211,7 +211,7 @@
 TESTS_REQUIRE.extend(VISION_REQUIRE)
 TESTS_REQUIRE.extend(AUDIO_REQUIRE)

-QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0", "pyyaml>=5.3.1"]
+QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241", "pyyaml>=5.3.1"]

 DOCS_REQUIRE = [
     # Might need to add doc-builder and some specific deps in the future

src/datasets/commands/dummy_data.py (-1)

@@ -394,7 +394,6 @@ def _print_dummy_data_instructions(self, dataset_builder, mock_dl_manager):
         try:
             generator_splits = dataset_builder._split_generators(mock_dl_manager)
         except FileNotFoundError as e:
-
             print(
                 f"Dataset {self._dataset_name} with config {mock_dl_manager.config} seems to already open files in the method `_split_generators(...)`. You might consider to instead only open files in the method `_generate_examples(...)` instead. If this is not possible the dummy data has to be created with less guidance. Make sure you create the file {e.filename}."
             )

src/datasets/features/features.py (+1, -3)

@@ -23,9 +23,8 @@
 from dataclasses import InitVar, dataclass, field, fields
 from functools import reduce, wraps
 from operator import mul
-from typing import Any, ClassVar, Dict, List, Optional
+from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union
 from typing import Sequence as Sequence_
-from typing import Tuple, Union

 import numpy as np
 import pandas as pd
@@ -1763,7 +1762,6 @@ def unsimplify(feature: dict) -> dict:
         return feature

     def from_yaml_inner(obj: Union[dict, list]) -> Union[dict, list]:
-
         if isinstance(obj, dict):
             if not obj:
                 return {}

src/datasets/fingerprint.py (-2)

@@ -184,7 +184,6 @@ def get_temporary_cache_files_directory() -> str:
     """Return a directory that is deleted when session closes."""
     global _TEMP_DIR_FOR_TEMP_CACHE_FILES
     if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None:
-
         # Avoids a PermissionError on Windows caused by the datasets referencing
         # the files from the cache directory on clean-up
         def cleanup_func():
@@ -466,7 +465,6 @@ def fingerprint_transform(
     fingerprint_names = fingerprint_names if fingerprint_names is not None else ["new_fingerprint"]

     def _fingerprint(func):
-
         if not inplace and not all(name in func.__code__.co_varnames for name in fingerprint_names):
             raise ValueError("function {func} is missing parameters {fingerprint_names} in signature")

src/datasets/formatting/np_formatter.py

-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ def _consolidate(self, column):
4444
return column
4545

4646
def _tensorize(self, value):
47-
4847
if isinstance(value, (str, bytes, type(None))):
4948
return value
5049
elif isinstance(value, (np.character, np.ndarray)) and np.issubdtype(value.dtype, np.character):

src/datasets/io/csv.py (-1)

@@ -74,7 +74,6 @@ def __init__(
         num_proc: Optional[int] = None,
         **to_csv_kwargs,
     ):
-
         if num_proc is not None and num_proc <= 0:
             raise ValueError(f"num_proc {num_proc} must be an integer > 0.")

src/datasets/io/sql.py

-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ def __init__(
6464
num_proc: Optional[int] = None,
6565
**to_sql_kwargs,
6666
):
67-
6867
if num_proc is not None and num_proc <= 0:
6968
raise ValueError(f"num_proc {num_proc} must be an integer > 0.")
7069

src/datasets/naming.py

-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix
6767

6868

6969
def filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None):
70-
7170
prefix = filename_prefix_for_split(dataset_name, split)
7271
prefix = os.path.join(path, prefix)
7372

src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
133133

134134
if metadata_files:
135135
# add metadata if `metadata_files` are found and `drop_metadata` is None (default) or False
136-
add_metadata = not (self.config.drop_metadata is True)
136+
add_metadata = not self.config.drop_metadata
137137
# if `metadata_files` are found, add labels only if
138138
# `drop_labels` is set up to False explicitly (not-default behavior)
139139
add_labels = self.config.drop_labels is False
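
This simplification is safe only because `drop_metadata` is tri-state (None, True, or False): for those three values `not (x is True)` and `not x` agree, although the two forms diverge for arbitrary truthy objects. A quick standalone check:

```python
# The rewrite preserves behavior for the three values the option can take.
for flag in (None, True, False):
    assert (not (flag is True)) == (not flag)

# It would NOT be equivalent for arbitrary truthy values:
flag = "yes"
print(not (flag is True), not flag)  # True False
```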

src/datasets/packaged_modules/json/json.py (-1)

@@ -93,7 +93,6 @@ def _cast_table(self, pa_table: pa.Table) -> pa.Table:

     def _generate_tables(self, files):
         for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
-
             # If the file is one json object and if we need to look at the list of items in one specific field
             if self.config.field is not None:
                 with open(file, encoding="utf-8") as f:

src/datasets/utils/file_utils.py (-2)

@@ -100,7 +100,6 @@ def head_hf_s3(


 def hf_github_url(path: str, name: str, dataset=True, revision: Optional[str] = None) -> str:
-
     default_revision = "main" if version.parse(__version__).is_devrelease else __version__
     revision = revision or default_revision
     if dataset:
@@ -547,7 +546,6 @@ def get_from_cache(
     # Prevent parallel downloads of the same file with a lock.
     lock_path = cache_path + ".lock"
     with FileLock(lock_path):
-
         if resume_download:
             incomplete_path = cache_path + ".incomplete"

