Skip to content

Commit 479145b

Browse files
committed
First version of dataset verification, addressing #17
1 parent 6b5156f commit 479145b

File tree

4 files changed

+96
-0
lines changed

4 files changed

+96
-0
lines changed

nc2zarr/cli.py

+6
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@
5353
@click.option('--sort-by', '-s', 'sort_by', default=None,
5454
type=click.Choice(['path', 'name'], case_sensitive=True),
5555
help='Sort input files by specified property.')
56+
@click.option('--verify', 'verify', type=click.Choice(["on", 'off', 'auto']),
57+
default='auto',
58+
help='Switch verification either on, or off,'
59+
' or leave it up to CONFIG_FILE (=auto, the default).')
5660
@click.option('--dry-run', '-d', 'dry_run', is_flag=True, default=None,
5761
help='Open and process inputs only, omit data writing.')
5862
@click.option('--verbose', '-v', 'verbose', is_flag=True, multiple=True,
@@ -68,6 +72,7 @@ def nc2zarr(input_paths: Tuple[str],
6872
append: bool,
6973
decode_cf: bool,
7074
sort_by: str,
75+
verify: str,
7176
dry_run: bool,
7277
verbose: Tuple[bool],
7378
version: bool):
@@ -129,6 +134,7 @@ def nc2zarr(input_paths: Tuple[str],
129134
output_path=output_path,
130135
output_overwrite=overwrite,
131136
output_append=append,
137+
verify_enabled=True if verify == 'on' else False if verify == 'off' else None,
132138
verbosity=sum(verbose) if verbose else None,
133139
dry_run=dry_run)
134140
Converter(**config_kwargs).run()

nc2zarr/converter.py

+14
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from .preprocessor import DatasetPreProcessor
3232
from .processor import DatasetProcessor
3333
from .writer import DatasetWriter
34+
from .verifier import DatasetVerifier
3435

3536

3637
class Converter:
@@ -55,6 +56,8 @@ class Converter:
5556
:param output_append:
5657
:param output_append_dim:
5758
:param output_s3:
59+
:param verify_enabled:
60+
:param verify_open_params:
5861
:param dry_run:
5962
:param verbosity:
6063
"""
@@ -79,6 +82,8 @@ def __init__(self,
7982
output_append_dim: str = None,
8083
output_s3: Dict[str, Any] = None,
8184
output_retry: Dict[str, Any] = None,
85+
verify_enabled: bool = None,
86+
verify_open_params: Dict[str, Any] = None,
8287
dry_run: bool = False,
8388
verbosity: int = None):
8489

@@ -114,6 +119,8 @@ def __init__(self,
114119
self.output_append_dim = output_append_dim
115120
self.output_s3 = output_s3
116121
self.output_retry = output_retry
122+
self.verify_enabled = verify_enabled
123+
self.verify_open_params = verify_open_params
117124
self.dry_run = dry_run
118125
self.verbosity = verbosity
119126

@@ -153,9 +160,16 @@ def _run(self):
153160
dry_run=self.dry_run,
154161
reset_attrs=not self.input_decode_cf)
155162

163+
dataset_verifier = DatasetVerifier(output_path=self.output_path,
164+
output_s3_kwargs=self.output_s3,
165+
verify_enabled=self.verify_enabled,
166+
verify_open_params=self.verify_open_params)
167+
156168
append = None
157169
for input_dataset in opener.open_datasets(preprocess=pre_processor.preprocess_dataset):
158170
output_dataset, output_encoding = processor.process_dataset(input_dataset)
159171
writer.write_dataset(output_dataset, encoding=output_encoding, append=append)
160172
input_dataset.close()
161173
append = True
174+
175+
dataset_verifier.verify_dataset()

nc2zarr/res/config-template.yml

+8
Original file line numberDiff line numberDiff line change
@@ -163,3 +163,11 @@ output:
163163
delay: 0.1
164164
# multiplier applied to delay between attempts. 1.0 means no backoff.
165165
backoff: 1.1
166+
167+
168+
verify:
169+
enabled: true
170+
171+
open_params:
172+
consolidated: false
173+
decode_cf: false

nc2zarr/verifier.py

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# The MIT License (MIT)
2+
# Copyright (c) 2021 by Brockmann Consult GmbH and contributors
3+
#
4+
# Permission is hereby granted, free of charge, to any person obtaining a copy of
5+
# this software and associated documentation files (the "Software"), to deal in
6+
# the Software without restriction, including without limitation the rights to
7+
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
8+
# of the Software, and to permit persons to whom the Software is furnished to do
9+
# so, subject to the following conditions:
10+
#
11+
# The above copyright notice and this permission notice shall be included in all
12+
# copies or substantial portions of the Software.
13+
#
14+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20+
# SOFTWARE.
21+
22+
from typing import Dict, Any
23+
24+
import fsspec
25+
import fsspec.implementations.local
26+
import xarray as xr
27+
28+
from .log import LOGGER
29+
from .log import log_duration
30+
31+
32+
class DatasetVerifier:
    """Check that a written Zarr dataset can be re-opened.

    Verification consists of opening the dataset found at ``output_path``
    with ``xarray.open_zarr`` and logging its representation. The outcome
    is reported through ``LOGGER`` only; a failed verification is logged
    as an error and is not raised to the caller.

    :param output_path: Path or URL of the Zarr dataset to verify.
    :param output_s3_kwargs: Keyword arguments passed to
        ``fsspec.filesystem('s3', ...)``. If given (non-empty), or if
        ``output_path`` starts with ``s3://``, the S3 filesystem is used,
        otherwise the local file system.
    :param verify_enabled: Whether verification is performed at all.
        Any falsy value, including ``None``, disables it.
    :param verify_open_params: Extra keyword arguments passed to
        ``xarray.open_zarr``. Defaults to no extra arguments.
    :param dry_run: If true, nothing is opened and the verification
        is reported as skipped.
    :raises ValueError: if ``output_path`` is empty or ``None``.
    """

    def __init__(self,
                 output_path: str,
                 output_s3_kwargs: Dict[str, Any] = None,
                 verify_enabled: bool = None,
                 verify_open_params: Dict[str, Any] = None,
                 dry_run: bool = False):
        if not output_path:
            raise ValueError('output_path must be given')
        self._output_path = output_path
        self._output_s3_kwargs = output_s3_kwargs
        self._verify_enabled = verify_enabled
        self._verify_open_params = verify_open_params or {}
        self._dry_run = dry_run

    def verify_dataset(self):
        """Open the output dataset once and log whether that succeeded.

        Does nothing but log a message if verification is disabled or if
        this is a dry run. Errors raised while opening the dataset are
        logged, not propagated.
        """
        if not self._verify_enabled:
            LOGGER.info('Dataset verification disabled.')
            return

        with log_duration('Verifying dataset'):
            if self._dry_run:
                LOGGER.info('Dataset verification skipped, it is a dry run.')
                return

            if self._output_s3_kwargs or self._output_path.startswith('s3://'):
                fs = fsspec.filesystem('s3', **(self._output_s3_kwargs or {}))
            else:
                fs = fsspec.filesystem('file')
            store = fs.get_mapper(self._output_path, check=False, create=False)
            # Verification is best-effort: report any ordinary error rather
            # than failing the conversion. Only Exception is caught so that
            # KeyboardInterrupt and SystemExit still terminate the program.
            # noinspection PyBroadException
            try:
                # The context manager closes the dataset again once its
                # representation has been logged.
                with xr.open_zarr(store, **self._verify_open_params) as dataset:
                    LOGGER.info(dataset)
                LOGGER.info('Dataset verification passed.')
            except Exception as e:
                LOGGER.error('Dataset verification failed!')
                # Keep the cause visible; it would otherwise be lost.
                LOGGER.error(f'{type(e).__name__}: {e}')

0 commit comments

Comments
 (0)