Merge branch 'IBM:dev' into dev
Signed-off-by: Pankaj Thorat <[email protected]>
pankajskku committed Jan 29, 2025
2 parents ad5dfcd + df61f49 commit db01dfd
Showing 105 changed files with 9,933 additions and 980 deletions.
6 changes: 5 additions & 1 deletion .github/mkdocs_hook.py
@@ -76,7 +76,11 @@ def update_markdown_content_updated(markdown: str, replacements: List[Tuple[str,
if len(replacements) > 0:
for text, old_value, new_value in replacements:
log.info(f"Updating Link: text: [{text}], link: {old_value}, updated link: {new_value}")
markdown = update_link(markdown, text, old_value, new_value)
try:
markdown = update_link(markdown, text, old_value, new_value)
except Exception as e:
log.info(f"Failed to update link: link: {old_value}, updated link: {new_value} due to {e}")

return markdown


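Isolated from the diff, the guarded loop can be sketched as a runnable snippet. The `update_link` body below is a hypothetical stand-in (the real one lives elsewhere in `mkdocs_hook.py`); it raises when a link is absent, which is exactly the failure the new `try`/`except` absorbs so that one bad link no longer aborts the whole page rewrite:

```python
import logging
from typing import List, Tuple

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def update_link(markdown: str, text: str, old: str, new: str) -> str:
    # Hypothetical stand-in for the real helper: fail loudly
    # when the [text](old) link is not present in the page.
    target = f"[{text}]({old})"
    if target not in markdown:
        raise ValueError(f"link not found: {target}")
    return markdown.replace(target, f"[{text}]({new})")


def update_markdown_content_updated(markdown: str, replacements: List[Tuple[str, str, str]]) -> str:
    for text, old_value, new_value in replacements:
        try:
            markdown = update_link(markdown, text, old_value, new_value)
        except Exception as e:
            # A failed replacement is logged and skipped, not fatal.
            log.info(f"Failed to update link: {old_value} -> {new_value} due to {e}")
    return markdown
```

With one valid and one missing link, the valid one is rewritten and the missing one is logged and skipped.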
4 changes: 2 additions & 2 deletions MAINTAINERS.md
@@ -1,8 +1,8 @@
# MAINTAINERS

David Wood - dawood@us.ibm.com
Maroun Touma - touma@us.ibm.com

Boris Lublinsky - blublinsky@ibm.com
Shivdeep Singh - shivdeep.singh@ibm.com

Revital Eres - [email protected]

@@ -12,7 +12,9 @@

import abc
import datetime
import fcntl
import platform
if platform.system() != 'Windows':
import fcntl
import os
import tempfile
import threading
@@ -98,7 +100,8 @@ def acquire(self, block=True, timeout=None):
timeout = max(0, timeout)
while not locked and (timeout is None or waited <= timeout):
try:
fcntl.lockf(self.fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
if platform.system() != 'Windows':
fcntl.lockf(self.fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
locked = True
except Exception as exc:
# Only get here if lock could not be acquired
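Note that the diff simply skips `fcntl` on Windows, leaving the file effectively unlocked there. A fuller cross-platform variant might fall back to `msvcrt` byte-range locking on Windows; the sketch below is an illustration under that assumption, not the project's actual implementation:

```python
import os
import platform
import tempfile

if platform.system() != "Windows":
    import fcntl      # POSIX-only advisory locking
else:
    import msvcrt     # Windows byte-range locking


class SimpleFileLock:
    """Minimal sketch of a non-blocking cross-platform file lock."""

    def __init__(self, path: str):
        self.fd = os.open(path, os.O_RDWR | os.O_CREAT)

    def try_acquire(self) -> bool:
        try:
            if platform.system() != "Windows":
                fcntl.lockf(self.fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
            else:
                # Lock 1 byte at the current file position, non-blocking.
                msvcrt.locking(self.fd, msvcrt.LK_NBLCK, 1)
            return True
        except OSError:
            # Lock is held elsewhere; caller may retry within its timeout.
            return False

    def release(self):
        if platform.system() != "Windows":
            fcntl.lockf(self.fd, fcntl.LOCK_UN)
        else:
            msvcrt.locking(self.fd, msvcrt.LK_UNLCK, 1)
```

Both branches raise `OSError` on contention, so the caller's retry loop stays platform-agnostic.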
108 changes: 108 additions & 0 deletions doc/quick-start/contribute-your-own-transform.md
@@ -37,6 +37,7 @@ The new transform we will build as part of this tutorial is meant to annotate e
1. [Integrate with CI/CD](#cicd) - automate testing, integration and packaging
1. [Create notebook](#notebook) - jupyter notebook showing how the transform can be invoked
1. [Create Readme file](#readme) - Readme file explaining how the transform is used
1. (Optional) [Setup for KFP Pipeline](#kfp) - Create artifacts for integrating with a KFP workflow


## Step 1: Create folder structure <a name=setup></a>
@@ -493,6 +494,113 @@ The notebook should show how to run the notebook from the current folder. Guidan
The README file for the transform should have, at a minimum, the following sections: Summary, Contributors, Configuration and command-line options, an example of how to run from the command line, and a link to a notebook. If applicable, it should have sections on Troubleshooting, Transforming data using the transform image, and the Ray and/or Spark versions of the transform.
[This](https://github.com/mt747/data-prep-kit/blob/block_digest/transforms/universal/digest/README.md) is a minimal README file for our digest transform.
## Step 10: Setup KFP Pipeline <a name="kfp"></a>
It might be desirable to build a KFP pipeline chaining multiple transforms together. In this section, we cover the steps the developer needs to take so that the operations team can create a pipeline tailored to their specific use case. We only cover the artifacts the developer needs to produce to enable integrating the digest transform into a KFP pipeline.
**kfp_ray/Makefile**
- Create a folder to host the KFP-related artifacts:
```shell
cd data-prep-kit/transforms/universal/digest
mkdir -p kfp_ray
cp ../../Makefile.kfp.template kfp_ray/Makefile
```
**kfp_ray/digest_wf.py**
- Create the KFP definition file. This file is used to produce the KFP workflow YAML definition file. Its full content is available [here](https://github.com/mt747/data-prep-kit/blob/13be7f4349e498041afe9834b1961d158728316a/transforms/universal/digest/kfp_ray/digest_wf.py); we only highlight some of the key elements.
- This file defines the reference to the transform's docker image and its entry point:
* task_image = "quay.io/dataprep1/data-prep-kit/digest-ray:latest"
* EXEC_SCRIPT_NAME: str = "-m dpk_digest.ray.runtime"
```Python
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import os
import kfp.compiler as compiler
import kfp.components as comp
import kfp.dsl as dsl
from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils
task_image = "quay.io/dataprep1/data-prep-kit/digest-ray:latest"
# the name of the job script
EXEC_SCRIPT_NAME: str = "-m dpk_digest.ray.runtime"
# components
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.3"
# path to kfp component specifications files
component_spec_path = "../../../../kfp/kfp_ray_components/"
```
- It defines the list of configuration parameters required by the framework, returned as a dictionary:
* "digest_algorithm": digest_algorithm,
```Python
def compute_exec_params_func(
...
...
digest_algorithm: str,
) -> dict:
return {
...
...
"digest_algorithm": digest_algorithm,
}
```
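With the elided framework parameters stripped out, the pattern is just a function that bundles its keyword inputs into the flat dictionary the runtime expects. In this standalone sketch, `data_s3_config` is a hypothetical framework parameter added for illustration:

```python
def compute_exec_params_func(
    data_s3_config: str,
    digest_algorithm: str,
) -> dict:
    # Transform-specific options ride alongside framework options
    # in a single flat dictionary handed to the runtime.
    return {
        "data_s3_config": data_s3_config,
        "digest_algorithm": digest_algorithm,
    }
```

Adding a new transform option means adding one parameter and one dictionary entry; the framework keys stay untouched.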
- It assigns a name to this workflow task:
* TASK_NAME: str = "digest"
```Python
# Task name is part of the pipeline name, the ray cluster name and the job name in DMF.
TASK_NAME: str = "digest"
```
- Pipeline definition method and default values:
```Python
@dsl.pipeline(
name=TASK_NAME + "-ray-pipeline",
description="Pipeline for digest",
)
def digest(
###
...
###
):
```
- It defines the `__main__` entry point that compiles the YAML file required to run the pipeline in KFP:
```Python
if __name__ == "__main__":
# Compiling the pipeline
compiler.Compiler().compile(digest, __file__.replace(".py", ".yaml"))
```
## Contributors
- Maroun Touma ([email protected])
1 change: 1 addition & 0 deletions examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md
@@ -21,6 +21,7 @@ Another useful feature of the KFP v2 is the `Json` editor for the `dict` type in
- It creates just one run that includes all the nested transforms and their sub-tasks.
- No need for an additional component such as `executeSubWorkflowComponent.yaml`; all the implementation lives in the same pipeline file.
- KFP v1 superpipelines offer an option to override the common parameters with values specific to each transform; this option is missing in the KFP v2 superpipelines.
- In KFP v2 pipelines the user is asked to provide a unique string for the Ray cluster created at run creation time (called `ray_run_id_KFPv2`). This is because `dsl.RUN_ID_PLACEHOLDER` is deprecated and cannot be used since SDK 2.5.0, and a unique string cannot be generated at run time; see https://github.com/kubeflow/pipelines/issues/10187.
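Since the pipeline can no longer mint the id itself, the caller can generate one before submitting the run and pass it as the `ray_run_id_KFPv2` parameter. A minimal sketch of one way to do that:

```python
import uuid

def new_ray_run_id() -> str:
    # A unique string the user supplies as `ray_run_id_KFPv2` at run
    # creation time, standing in for the deprecated dsl.RUN_ID_PLACEHOLDER.
    return uuid.uuid4().hex

run_id = new_ray_run_id()
```

The value is then supplied alongside the other run parameters when the compiled pipeline is submitted.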

### How to compile the superpipeline
```
@@ -57,6 +57,7 @@ def super_pipeline(
p1_pipeline_data_max_files: int = -1,
p1_pipeline_data_num_samples: int = -1,
p1_pipeline_data_checkpointing: bool = False,
p1_pipeline_ray_run_id_KFPv2: str = "",
# noop step parameters
p2_name: str = "noop",
p2_skip: bool = False,