Skip to content

Commit

Permalink
Merge pull request #593 from samiravaez/add-feature
Browse files Browse the repository at this point in the history
Add directory check in save_parquet
  • Loading branch information
mdekstrand authored Jan 7, 2025
2 parents 3b79a8a + cb9d14b commit 91216fd
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 0 deletions.
6 changes: 6 additions & 0 deletions lenskit/lenskit/data/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def save_parquet(
layout: Literal["native", "flat"] = "native",
batch_size: int = 5000,
compression: Literal["zstd", "gzip", "snappy", "lz4"] | None = "zstd",
mkdir: bool = True,
) -> None:
"""
Save this item list collection to a Parquet file. This supports two
Expand All @@ -241,7 +242,12 @@ def save_parquet(
The Arrow record batch size.
compression:
The compression scheme to use.
mkdir:
Whether to create the parent directories if they don't exist.
"""
if mkdir:
Path(path).parent.mkdir(parents=True, exist_ok=True)

if layout == "flat":
self.to_df().to_parquet(path, compression=compression)
return
Expand Down
12 changes: 12 additions & 0 deletions lenskit/tests/data/test_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,18 @@ def test_save_parquet_with_empty(ml_ds: Dataset, tmpdir: Path):
assert sum(len(l1) for l1 in ilc2.lists()) == sum(len(l2) for l2 in ilc.lists())


def test_save_parquet_with_mkdir(tmpdir: Path):
    """
    Check that ``save_parquet`` only creates the parent directory of its
    target file when ``mkdir`` is true.
    """
    empty_lists = ItemListCollection(["user_id"])

    # mkdir enabled: the missing parent directory should exist afterwards.
    created_dir = tmpdir / "subdir"
    empty_lists.save_parquet(created_dir / "items.parquet", mkdir=True)
    assert created_dir.exists()

    # mkdir disabled: the parent directory must not have been created.
    absent_dir = tmpdir / "no_mkdir"
    empty_lists.save_parquet(absent_dir / "items.parquet", mkdir=False)
    assert not absent_dir.exists()


def test_write_recs_parquet(demo_recs, tmpdir: Path):
split, recs = demo_recs

Expand Down

0 comments on commit 91216fd

Please sign in to comment.