Address review comments

schustmi · schustmi · commit 0e4c865f83f0 · 2025-11-28T14:16:28.000+08:00
diff --git a/src/zenml/integrations/gcp/google_credentials_mixin.py b/src/zenml/integrations/gcp/google_credentials_mixin.py
@@ -48,6 +48,9 @@ class GoogleCredentialsConfigMixin(StackComponentConfig):
 class GoogleCredentialsMixin(StackComponent):
     """StackComponent mixin to get Google Cloud Platform credentials."""
 
+    _gcp_credentials: Optional[Credentials] = None
+    _gcp_project_id: Optional[str] = None
+
     @property
     def config(self) -> GoogleCredentialsConfigMixin:
         """Returns the `GoogleCredentialsConfigMixin` config.
@@ -57,6 +60,18 @@ def config(self) -> GoogleCredentialsConfigMixin:
         """
         return cast(GoogleCredentialsConfigMixin, self._config)
 
+    @property
+    def gcp_project_id(self) -> str:
+        """Get the GCP project ID, resolving it via authentication if unset.
+
+        Returns:
+            The GCP project ID.
+        """
+        if self._gcp_project_id is None:
+            _, self._gcp_project_id = self._get_authentication()
+
+        return self._gcp_project_id
+
     def _get_authentication(self) -> Tuple["Credentials", str]:
         """Get GCP credentials and the project ID associated with the credentials.
 
@@ -79,6 +94,12 @@ def _get_authentication(self) -> Tuple["Credentials", str]:
             GCPServiceConnector,
         )
 
+        if self.connector_has_expired():
+            self._gcp_credentials = None
+
+        if self._gcp_credentials and self._gcp_project_id:
+            return self._gcp_credentials, self._gcp_project_id
+
         connector = self.get_connector()
         if connector:
             credentials = connector.connect()
@@ -90,6 +111,8 @@ def _get_authentication(self) -> Tuple["Credentials", str]:
                     "trying to use the linked connector, but got "
                     f"{type(credentials)}."
                 )
+            self._gcp_credentials = credentials
+            self._gcp_project_id = connector.config.gcp_project_id
             return credentials, connector.config.gcp_project_id
 
         if self.config.service_account_path:
@@ -111,4 +134,6 @@ def _get_authentication(self) -> Tuple["Credentials", str]:
         # If the project was set in the configuration, use it. Otherwise, use
         # the project that was used to authenticate.
         project_id = self.config.project if self.config.project else project_id
+        self._gcp_credentials = credentials
+        self._gcp_project_id = project_id
         return credentials, project_id
diff --git a/src/zenml/integrations/gcp/orchestrators/vertex_orchestrator.py b/src/zenml/integrations/gcp/orchestrators/vertex_orchestrator.py
@@ -144,6 +144,7 @@ class VertexOrchestrator(ContainerizedOrchestrator, GoogleCredentialsMixin):
     """Orchestrator responsible for running pipelines on Vertex AI."""
 
     _pipeline_root: str
+    _job_service_client: Optional[aiplatform.gapic.JobServiceClient] = None
 
     @property
     def config(self) -> VertexOrchestratorConfig:
@@ -261,6 +262,25 @@ def pipeline_directory(self) -> str:
         """
         return os.path.join(self.root_directory, "pipelines")
 
+    def get_job_service_client(self) -> aiplatform.gapic.JobServiceClient:
+        """Get a cached job service client, rebuilt on connector expiry.
+
+        Returns:
+            The job service client.
+        """
+        if self.connector_has_expired():
+            self._job_service_client = None
+
+        if self._job_service_client is None:
+            credentials, _ = self._get_authentication()
+            client_options = {
+                "api_endpoint": self.config.location + VERTEX_ENDPOINT_SUFFIX
+            }
+            self._job_service_client = aiplatform.gapic.JobServiceClient(
+                credentials=credentials, client_options=client_options
+            )
+        return self._job_service_client
+
     def _create_container_component(
         self,
         image: str,
@@ -696,34 +716,38 @@ def submit_dynamic_pipeline(
             network=self.config.network,
         )
 
-        credentials, project_id = self._get_authentication()
-        client_options = {
-            "api_endpoint": self.config.location + VERTEX_ENDPOINT_SUFFIX
-        }
-        client = aiplatform.gapic.JobServiceClient(
-            credentials=credentials, client_options=client_options
+        client = self.get_job_service_client()
+        parent = (
+            f"projects/{self.gcp_project_id}/locations/{self.config.location}"
         )
-        parent = f"projects/{project_id}/locations/{self.config.location}"
         job_model = client.create_custom_job(
             parent=parent, custom_job=job_request
         )
 
-        wait_for_completion = None
+        _wait_for_completion = None
         if settings.synchronous:
-            wait_for_completion = lambda: monitor_job(
-                job_id=job_model.name,
-                credentials_source=self,
-                client_options=client_options,
-            )
 
-        self._initialize_vertex_client()
-        job = aiplatform.CustomJob.get(job_model.name)
+            def _wait_for_completion() -> None:
+                logger.info("Waiting for the VertexAI job to finish...")
+                monitor_job(
+                    job_id=job_model.name,
+                    get_client=self.get_job_service_client,
+                )
+                logger.info("VertexAI job completed successfully.")
+
+        credentials, project_id = self._get_authentication()
+        job = aiplatform.CustomJob.get(
+            job_model.name,
+            project=project_id,
+            location=self.config.location,
+            credentials=credentials,
+        )
         metadata = self.compute_metadata(job)
 
         logger.info("View the Vertex job at %s", job._dashboard_uri())
 
         return SubmissionResult(
-            wait_for_completion=wait_for_completion,
+            wait_for_completion=_wait_for_completion,
             metadata=metadata,
         )
 
@@ -765,14 +789,10 @@ def run_isolated_step(
             network=self.config.network,
         )
 
-        credentials, project_id = self._get_authentication()
-        client_options = {
-            "api_endpoint": self.config.location + VERTEX_ENDPOINT_SUFFIX
-        }
-        client = aiplatform.gapic.JobServiceClient(
-            credentials=credentials, client_options=client_options
+        client = self.get_job_service_client()
+        parent = (
+            f"projects/{self.gcp_project_id}/locations/{self.config.location}"
         )
-        parent = f"projects/{project_id}/locations/{self.config.location}"
         logger.info(
             "Submitting custom job='%s', path='%s' to Vertex AI Training.",
             job_request["display_name"],
@@ -781,8 +801,7 @@ def run_isolated_step(
         job = client.create_custom_job(parent=parent, custom_job=job_request)
         monitor_job(
             job_id=job.name,
-            credentials_source=self,
-            client_options=client_options,
+            get_client=self.get_job_service_client,
         )
 
     def _upload_and_run_pipeline(
@@ -1060,15 +1079,6 @@ def _configure_container_resources(
 
         return dynamic_component
 
-    def _initialize_vertex_client(self) -> None:
-        """Initializes the Vertex client."""
-        credentials, project_id = self._get_authentication()
-        aiplatform.init(
-            project=project_id,
-            location=self.config.location,
-            credentials=credentials,
-        )
-
     def fetch_status(
         self, run: "PipelineRunResponse", include_steps: bool = False
     ) -> Tuple[
@@ -1102,8 +1112,6 @@ def fetch_status(
             == run.stack.components[StackComponentType.ORCHESTRATOR][0].id
         )
 
-        self._initialize_vertex_client()
-
         # Fetch the status of the PipelineJob
         if METADATA_ORCHESTRATOR_RUN_ID in run.run_metadata:
             run_id = run.run_metadata[METADATA_ORCHESTRATOR_RUN_ID]
@@ -1115,8 +1123,14 @@ def fetch_status(
                 "the status."
             )
 
+        credentials, project_id = self._get_authentication()
         if run.snapshot and run.snapshot.is_dynamic:
-            status = aiplatform.CustomJob.get(run_id).state
+            status = aiplatform.CustomJob.get(
+                run_id,
+                project=project_id,
+                location=self.config.location,
+                credentials=credentials,
+            ).state
 
             if status in [
                 JobState.JOB_STATE_QUEUED,
@@ -1143,7 +1157,12 @@ def fetch_status(
             else:
                 pipeline_status = run.status
         else:
-            status = aiplatform.PipelineJob.get(run_id).state
+            status = aiplatform.PipelineJob.get(
+                run_id,
+                project=project_id,
+                location=self.config.location,
+                credentials=credentials,
+            ).state
 
             # Map the potential outputs to ZenML ExecutionStatus. Potential values:
             # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker/client/describe_pipeline_execution.html#
diff --git a/src/zenml/integrations/gcp/step_operators/vertex_step_operator.py b/src/zenml/integrations/gcp/step_operators/vertex_step_operator.py
@@ -57,6 +57,8 @@ class VertexStepOperator(BaseStepOperator, GoogleCredentialsMixin):
     ZenML entrypoint command in it.
     """
 
+    _job_service_client: Optional[aiplatform.gapic.JobServiceClient] = None
+
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         """Initializes the step operator and validates the accelerator type.
 
@@ -150,6 +152,25 @@ def get_docker_builds(
 
         return builds
 
+    def get_job_service_client(self) -> aiplatform.gapic.JobServiceClient:
+        """Get a cached job service client, rebuilt on connector expiry.
+
+        Returns:
+            The job service client.
+        """
+        if self.connector_has_expired():
+            self._job_service_client = None
+
+        if self._job_service_client is None:
+            credentials, _ = self._get_authentication()
+            client_options = {
+                "api_endpoint": self.config.region + VERTEX_ENDPOINT_SUFFIX
+            }
+            self._job_service_client = aiplatform.gapic.JobServiceClient(
+                credentials=credentials, client_options=client_options
+            )
+        return self._job_service_client
+
     def launch(
         self,
         info: "StepRunInfo",
@@ -193,15 +214,10 @@ def launch(
         )
         logger.debug("Vertex AI Job=%s", job_request)
 
-        credentials, project_id = self._get_authentication()
-        client_options = {
-            "api_endpoint": self.config.region + VERTEX_ENDPOINT_SUFFIX
-        }
-        client = aiplatform.gapic.JobServiceClient(
-            credentials=credentials, client_options=client_options
+        client = self.get_job_service_client()
+        parent = (
+            f"projects/{self.gcp_project_id}/locations/{self.config.region}"
         )
-
-        parent = f"projects/{project_id}/locations/{self.config.region}"
         logger.info(
             "Submitting custom job='%s', path='%s' to Vertex AI Training.",
             job_request["display_name"],
@@ -215,6 +231,5 @@ def launch(
 
         monitor_job(
             job_id=response.name,
-            credentials_source=self,
-            client_options=client_options,
+            get_client=self.get_job_service_client,
         )
diff --git a/src/zenml/integrations/gcp/utils.py b/src/zenml/integrations/gcp/utils.py
@@ -14,7 +14,7 @@
 """Vertex utilities."""
 
 import time
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
 
 from google.api_core.exceptions import ServerError
 from google.cloud import aiplatform
@@ -25,9 +25,6 @@
     VERTEX_JOB_STATES_COMPLETED,
     VERTEX_JOB_STATES_FAILED,
 )
-from zenml.integrations.gcp.google_credentials_mixin import (
-    GoogleCredentialsMixin,
-)
 from zenml.integrations.gcp.vertex_custom_job_parameters import (
     VertexCustomJobParameters,
 )
@@ -55,59 +52,33 @@ def validate_accelerator_type(accelerator_type: Optional[str] = None) -> None:
         )
 
 
-def get_job_service_client(
-    credentials_source: GoogleCredentialsMixin,
-    client_options: Optional[Dict[str, Any]] = None,
-) -> aiplatform.gapic.JobServiceClient:
-    """Gets a job service client.
-
-    Args:
-        credentials_source: The component that provides the credentials to
-            access the job.
-        client_options: The client options to use for the job service client.
-
-    Returns:
-        A job service client.
-    """
-    credentials, _ = credentials_source._get_authentication()
-    return aiplatform.gapic.JobServiceClient(
-        credentials=credentials, client_options=client_options
-    )
-
-
 def monitor_job(
     job_id: str,
-    credentials_source: GoogleCredentialsMixin,
-    client_options: Optional[Dict[str, Any]] = None,
+    get_client: Callable[[], aiplatform.gapic.JobServiceClient],
 ) -> None:
     """Monitors a job until it is completed.
 
     Args:
         job_id: The ID of the job to monitor.
-        credentials_source: The component that provides the credentials to
-            access the job.
-        client_options: The client options to use for the job service client.
+        get_client: Returns an authenticated client; called on each poll.
 
     Raises:
         RuntimeError: If the job fails.
     """
     retry_count = 0
-    client = get_job_service_client(
-        credentials_source=credentials_source, client_options=client_options
-    )
+    client = get_client()
 
     while True:
         time.sleep(POLLING_INTERVAL_IN_SECONDS)
-        if credentials_source.connector_has_expired():
-            client = get_job_service_client(
-                credentials_source=credentials_source,
-                client_options=client_options,
-            )
+        # Ask for the client again so expired credentials can be refreshed
+        client = get_client()
 
         try:
             response = client.get_custom_job(name=job_id)
             retry_count = 0
         except (ConnectionError, ServerError) as err:
+            # Retry on connection errors, see also
+            # https://github.com/googleapis/google-api-python-client/issues/218
             if retry_count < CONNECTION_ERROR_RETRY_LIMIT:
                 retry_count += 1
                 logger.warning(