diff --git a/transforms/README-list.md b/transforms/README-list.md index 8e55d1df5..08d4e76c8 100644 --- a/transforms/README-list.md +++ b/transforms/README-list.md @@ -45,6 +45,7 @@ Note: This list includes the transforms that were part of the release starting w ### 1.0.1.dev1 Added Gneissweb transforms + fdedup fix for windows ### 1.0.1.dev0 PR #979 (code_profiler) ### 1.0.0.a6 diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index 517d039d4..225a73b7e 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.3 +data-prep-toolkit[ray]>=0.2.4.dev0 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py index b414adaa6..06c02f553 100644 --- a/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py @@ -130,7 +130,14 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str band = int(match.group(1)) segment = int(match.group(2)) else: - raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") + match = re.match(r"^band=(\d+)\\segment=(\d+)$", folder_name) + if match: + band = int(match.group(1)) + segment = int(match.group(2)) + else: + raise ValueError( + f"Wrong folder_name {folder_name}, should be either band=b/segment=s or band=b\\segment=s (windows)" + ) output_folder = TransformUtils.clean_path(self.data_access.output_folder) output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") diff --git a/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py index d01ee7b85..50630bd80 100644 --- a/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py @@ -394,7 +394,7 @@ def _emit_bands(self, int_id_column: str, minhashes: np.array, b: int, r: int, s results = [] for band_index in range(b): band_hash, _ = mmh3.hash64( - minhashes[band_index * r : (band_index + 1) * r], + minhashes[band_index * r : (band_index + 1) * r].tobytes(), seed=seed, signed=False, ) diff --git a/transforms/universal/fdedup/requirements.txt b/transforms/universal/fdedup/requirements.txt index 42af99d8b..ee84fb6eb 100644 --- a/transforms/universal/fdedup/requirements.txt +++ b/transforms/universal/fdedup/requirements.txt @@ -1,9 +1,9 @@ pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 -polars==1.9.0 +polars>=1.9.0, !=1.10.0, !=1.11.0, !=1.12.0 disjoint-set>=0.8.0 scipy>=1.12.1, <2.0.0 numpy<1.29.0 sentencepiece>=0.2.0 -mmh3>=4.1.0, <=5.0.1 +mmh3>=4.1.0