diff --git a/lenskit/lenskit/data/collection.py b/lenskit/lenskit/data/collection.py
index 665e2f004..c98e24e86 100644
--- a/lenskit/lenskit/data/collection.py
+++ b/lenskit/lenskit/data/collection.py
@@ -225,6 +225,7 @@ def save_parquet(
         layout: Literal["native", "flat"] = "native",
         batch_size: int = 5000,
         compression: Literal["zstd", "gzip", "snappy", "lz4"] | None = "zstd",
+        mkdir: bool = True,
     ) -> None:
         """
         Save this item list collection to a Parquet file. This supports two
@@ -241,7 +242,12 @@ def save_parquet(
                 The Arrow record batch size.
             compression:
                 The compression scheme to use.
+            mkdir:
+                Whether to create the parent directories if they don't exist.
         """
+        if mkdir:
+            Path(path).parent.mkdir(parents=True, exist_ok=True)
+
         if layout == "flat":
             self.to_df().to_parquet(path, compression=compression)
             return
diff --git a/lenskit/tests/data/test_collection.py b/lenskit/tests/data/test_collection.py
index e57824e5e..cb932e908 100644
--- a/lenskit/tests/data/test_collection.py
+++ b/lenskit/tests/data/test_collection.py
@@ -248,6 +248,26 @@ def test_save_parquet_with_empty(ml_ds: Dataset, tmpdir: Path):
     assert sum(len(l1) for l1 in ilc2.lists()) == sum(len(l2) for l2 in ilc.lists())
 
 
+def test_save_parquet_with_mkdir(tmpdir: Path):
+    """
+    Check that ``mkdir=True`` creates a missing parent directory and ``mkdir=False`` does not.
+    """
+    from contextlib import suppress
+
+    ilc = ItemListCollection(["user_id"])
+
+    f = tmpdir / "subdir" / "items.parquet"
+    ilc.save_parquet(f, mkdir=True)
+    assert (tmpdir / "subdir").exists()
+
+    f_no_mkdir = tmpdir / "no_mkdir" / "items.parquet"
+    # Whether writing into a missing directory raises depends on the writer
+    # backend; either way, mkdir=False must not create the directory.
+    with suppress(OSError):
+        ilc.save_parquet(f_no_mkdir, mkdir=False)
+    assert not (tmpdir / "no_mkdir").exists()
+
+
 def test_write_recs_parquet(demo_recs, tmpdir: Path):
     split, recs = demo_recs
 