From c69ebd6090b7c384172cd26a526c4ad42a9b610d Mon Sep 17 00:00:00 2001 From: Constantin Adam Date: Thu, 30 Jan 2025 12:29:01 -0500 Subject: [PATCH 1/7] Fixed incompatibility with mmh3>5.0.1 Signed-off-by: Constantin Adam --- .../universal/fdedup/dpk_fdedup/signature_calc/transform.py | 2 +- transforms/universal/fdedup/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py index d01ee7b85..50630bd80 100644 --- a/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py @@ -394,7 +394,7 @@ def _emit_bands(self, int_id_column: str, minhashes: np.array, b: int, r: int, s results = [] for band_index in range(b): band_hash, _ = mmh3.hash64( - minhashes[band_index * r : (band_index + 1) * r], + minhashes[band_index * r : (band_index + 1) * r].tobytes(), seed=seed, signed=False, ) diff --git a/transforms/universal/fdedup/requirements.txt b/transforms/universal/fdedup/requirements.txt index 42af99d8b..b28fac859 100644 --- a/transforms/universal/fdedup/requirements.txt +++ b/transforms/universal/fdedup/requirements.txt @@ -6,4 +6,4 @@ disjoint-set>=0.8.0 scipy>=1.12.1, <2.0.0 numpy<1.29.0 sentencepiece>=0.2.0 -mmh3>=4.1.0, <=5.0.1 +mmh3>=4.1.0 From 328943f4b2b2097ad0d16d9c66bfce91d539486e Mon Sep 17 00:00:00 2001 From: Constantin Adam Date: Thu, 30 Jan 2025 14:42:52 -0500 Subject: [PATCH 2/7] Allow polars>=1.13.0 Signed-off-by: Constantin Adam --- transforms/universal/fdedup/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/requirements.txt b/transforms/universal/fdedup/requirements.txt index b28fac859..a8b0354a7 100644 --- a/transforms/universal/fdedup/requirements.txt +++ b/transforms/universal/fdedup/requirements.txt @@ -1,7 +1,7 @@ pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 -polars==1.9.0 +polars>=1.13.0 disjoint-set>=0.8.0 scipy>=1.12.1, <2.0.0 numpy<1.29.0 From 86bbdedc66d089b6f37136dc2f8759fff5d023cd Mon Sep 17 00:00:00 2001 From: Constantin Adam Date: Thu, 30 Jan 2025 17:36:02 -0500 Subject: [PATCH 3/7] First attempt to fix windows bug Signed-off-by: Constantin Adam --- .../fdedup/dpk_fdedup/cluster_analysis/transform.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py index b414adaa6..cc7b30d68 100644 --- a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py @@ -125,7 +125,10 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str ) if retries > 0: metadata |= {"data_access_retries": retries} - match = re.match(r"^band=(\d+)/segment=(\d+)$", folder_name) + if os.sep == "\\": + match = re.match(r"^band=(\d+)\\segment=(\d+)$", folder_name) + else: + match = re.match(r"^band=(\d+)/segment=(\d+)$", folder_name) if match: band = int(match.group(1)) segment = int(match.group(2)) From e77afe7866e9f6b1afe7cc568ef90215e3aada19 Mon Sep 17 00:00:00 2001 From: Constantin Adam Date: Thu, 30 Jan 2025 20:42:23 -0500 Subject: [PATCH 4/7] Fixed polars version requirements Signed-off-by: Constantin Adam --- transforms/universal/fdedup/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/requirements.txt b/transforms/universal/fdedup/requirements.txt index a8b0354a7..ee84fb6eb 100644 --- a/transforms/universal/fdedup/requirements.txt +++ b/transforms/universal/fdedup/requirements.txt @@ -1,7 +1,7 @@ pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 -polars>=1.13.0 +polars>=1.9.0, !=1.10.0, !=1.11.0, !=1.12.0 disjoint-set>=0.8.0 scipy>=1.12.1, <2.0.0 numpy<1.29.0 From bc4b4c14520beb00f9f9cd84a82d220c3d43a67c Mon Sep 17 00:00:00 2001 From: Constantin Adam Date: Thu, 30 Jan 2025 20:43:42 -0500 Subject: [PATCH 5/7] Fix for issue #989 (fdedup fails on windows) Signed-off-by: Constantin Adam --- .../dpk_fdedup/cluster_analysis/transform.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py index cc7b30d68..06c02f553 100644 --- a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py @@ -125,15 +125,19 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str ) if retries > 0: metadata |= {"data_access_retries": retries} - if os.sep == "\\": - match = re.match(r"^band=(\d+)\\segment=(\d+)$", folder_name) - else: - match = re.match(r"^band=(\d+)/segment=(\d+)$", folder_name) + match = re.match(r"^band=(\d+)/segment=(\d+)$", folder_name) if match: band = int(match.group(1)) segment = int(match.group(2)) else: - raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") + match = re.match(r"^band=(\d+)\\segment=(\d+)$", folder_name) + if match: + band = int(match.group(1)) + segment = int(match.group(2)) + else: + raise ValueError( + f"Wrong folder_name {folder_name}, should be either band=b/segment=s or band=b\\segment=s (windows)" + ) output_folder = TransformUtils.clean_path(self.data_access.output_folder) output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") From 0dd3b71b959506733c1f286c88702213a585ccde Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 31 Jan 2025 16:36:59 -0500 Subject: [PATCH 6/7] Added to wheel Signed-off-by: Maroun Touma --- transforms/README-list.md | 1 + 1 file changed, 1 insertion(+) diff --git a/transforms/README-list.md b/transforms/README-list.md index 8e55d1df5..08d4e76c8 100644 --- a/transforms/README-list.md +++ b/transforms/README-list.md @@ -45,6 +45,7 @@ Note: This list includes the transforms that were part of the release starting w ### 1.0.1.dev1 Added Gneissweb transforms + fdedup fix for windows ### 1.0.1.dev0 PR #979 (code_profiler) ### 1.0.0.a6 From f827bb6cebfe3096b476411683a3ad8b0b15e2b8 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 31 Jan 2025 16:41:23 -0500 Subject: [PATCH 7/7] fix ray dependency Signed-off-by: Maroun Touma --- transforms/requirements-ray.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index 517d039d4..225a73b7e 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.3 +data-prep-toolkit[ray]>=0.2.4.dev0 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5