Skip to content

Commit 3a3b266

Browse files
authored
Merge pull request #152 from jerabaul29/feat/illustrate_compression
Add example of compression, chunking by trajectory
2 parents fcedbb8 + bbd6e09 commit 3a3b266

File tree

2 files changed

+17690
-0
lines changed

2 files changed

+17690
-0
lines changed

examples/example_compress.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""
2+
Examples of compressing data when saving to .nc
3+
==============================================================================
4+
"""
5+
6+
# %%
7+
8+
import xarray as xr
9+
from trajan.readers.omb import read_omb_csv
10+
from pathlib import Path
11+
import os
12+
13+
# %%

# the large OMB test CSV shipped with the repository
test_data_dir = Path.cwd().parent / "tests" / "test_data" / "csv"
path_to_test_data = test_data_dir / "omb_large.csv"
xr_buoys = read_omb_csv(path_to_test_data)

# %%

# by default, to_netcdf does not perform any compression
xr_buoys.to_netcdf("no_compression.nc")

# on my machine, this is around 33MB
uncompressed_mb = round(os.stat("no_compression.nc").st_size / (1024 ** 2), 2)
print(f"size no compression: {uncompressed_mb} MB")

# %%

# compression is obtained by passing the right encoding arguments explicitly;
# the best settings depend on the dataset and on which access pattern should
# be fastest - keep memory layout and performance in mind!

# here we use a simple per-trajectory scheme: each trajectory is compressed
# as one chunk, so reading a full trajectory is fast, while reading e.g. the
# 5th point of every trajectory is slow.
39+
# choose the encoding chunking - this may be application dependent, here
# chunk trajectory as a whole
def generate_chunksize(var, dataset=None):
    """Return the on-disk chunk shape for variable *var*.

    Every dimension keeps its full length except ``trajectory``, which is
    chunked to 1 so that each trajectory is compressed (and can be read
    back) as a single chunk.

    Parameters
    ----------
    var : str
        Name of the variable to compute a chunking for.
    dataset : optional
        Dataset-like object to inspect (anything whose ``dataset[var]``
        exposes ``.dims`` and ``.shape``). Defaults to the module-level
        ``xr_buoys`` for backward compatibility.

    Returns
    -------
    tuple of int
        One chunk size per dimension of ``dataset[var]``.

    Raises
    ------
    ValueError
        If ``var`` has no ``trajectory`` dimension (from ``tuple.index``).
    """
    ds = xr_buoys if dataset is None else dataset

    dims = ds[var].dims
    shape = list(ds[var].shape)

    # one trajectory per chunk; every other dimension stays whole
    shape[dims.index("trajectory")] = 1

    return tuple(shape)
47+
48+
49+
# build one encoding entry per data variable: zlib/deflate at level 5,
# chunked so that every trajectory forms a single chunk
encoding = {
    name: {
        "zlib": True,
        "complevel": 5,
        "chunksizes": generate_chunksize(name),
    }
    for name in xr_buoys.data_vars
}

# show what the resulting encoding looks like
for var in encoding:
    print(f"{var}: {encoding[var] = }")
print("")

# save, this time with compression
xr_buoys.to_netcdf("trajectory_compression.nc", encoding=encoding)

# on my machine, this is around 5.6MB
compressed_mb = round(os.stat("trajectory_compression.nc").st_size / (1024 ** 2), 2)
print(f"size with compression: {compressed_mb} MB")

# %%

0 commit comments

Comments
 (0)