Also force tiled VAE encode where possible #1658

Acly · Acly · commit 9c8dfc0acead · 2025-03-21T12:49:25.000+01:00
diff --git a/ai_diffusion/comfy_workflow.py b/ai_diffusion/comfy_workflow.py
@@ -751,6 +751,9 @@ def vae_encode(self, vae: Output, image: Output):
     def vae_encode_inpaint(self, vae: Output, image: Output, mask: Output):
         return self.add("VAEEncodeForInpaint", 1, vae=vae, pixels=image, mask=mask, grow_mask_by=0)
 
+    def vae_encode_tiled(self, vae: Output, image: Output):
+        return self.add("VAEEncodeTiled", 1, vae=vae, pixels=image, tile_size=512, overlap=64)
+
     def vae_decode(self, vae: Output, latent_image: Output):
         return self.add("VAEDecode", 1, vae=vae, samples=latent_image)
 
diff --git a/ai_diffusion/workflow.py b/ai_diffusion/workflow.py
@@ -156,6 +156,12 @@ def load_checkpoint_with_lora(w: ComfyWorkflow, checkpoint: CheckpointInput, mod
     return model, Clip(clip, arch), vae
 
 
+def vae_encode(w: ComfyWorkflow, vae: Output, image: Output, tiled: bool):
+    if tiled:
+        return w.vae_encode_tiled(vae, image)
+    return w.vae_encode(vae, image)
+
+
 def vae_decode(w: ComfyWorkflow, vae: Output, latent: Output, tiled: bool):
     if tiled:
         return w.vae_decode_tiled(vae, latent)
@@ -678,7 +684,7 @@ def scale_refine_and_decode(
     decoded = vae_decode(w, vae, latent, tiled_vae)
     upscale = w.upscale_image(upscale_model, decoded)
     upscale = w.scale_image(upscale, extent.desired)
-    latent = w.vae_encode(vae, upscale)
+    latent = vae_encode(w, vae, upscale, tiled_vae)
     params = _sampler_params(sampling, strength=0.4)
 
     positive, negative = encode_text_prompt(w, cond, clip, regions)
@@ -874,7 +880,7 @@ def inpaint(
         )
         inpaint_model = model
     else:
-        latent = w.vae_encode(vae, in_image)
+        latent = vae_encode(w, vae, in_image, checkpoint.tiled_vae)
         latent = w.set_latent_noise_mask(latent, initial_mask)
         inpaint_model = model
 
@@ -899,7 +905,7 @@ def inpaint(
         upscale = ensure_minimum_extent(w, upscale, initial_bounds.extent, 32)
         upscale = w.upscale_image(upscale_model, upscale)
         upscale = w.scale_image(upscale, upscale_extent.desired)
-        latent = w.vae_encode(vae, upscale)
+        latent = vae_encode(w, vae, upscale, checkpoint.tiled_vae)
         latent = w.set_latent_noise_mask(latent, upscale_mask)
 
         cond_upscale = cond.copy()
@@ -956,7 +962,7 @@ def refine(
     model = apply_regional_ip_adapter(w, model, cond.regions, extent.initial, models)
     in_image = w.load_image(image)
     in_image = scale_to_initial(extent, w, in_image, models)
-    latent = w.vae_encode(vae, in_image)
+    latent = vae_encode(w, vae, in_image, checkpoint.tiled_vae)
     latent = w.batch_latent(latent, misc.batch_count)
     positive, negative = encode_text_prompt(w, cond, clip, regions)
     model, positive, negative = apply_control(
@@ -1010,7 +1016,7 @@ def refine_region(
         inpaint_patch = w.load_fooocus_inpaint(**models.fooocus_inpaint)
         inpaint_model = w.apply_fooocus_inpaint(model, inpaint_patch, latent_inpaint)
     else:
-        latent = w.vae_encode(vae, in_image)
+        latent = vae_encode(w, vae, in_image, checkpoint.tiled_vae)
         latent = w.set_latent_noise_mask(latent, initial_mask)
         inpaint_model = model
 
@@ -1179,7 +1185,7 @@ def tiled_region(region: Region, index: int, tile_bounds: Bounds):
             w, tile_model, positive, negative, control, no_reshape, vae, models
         )
 
-        latent = w.vae_encode(vae, tile_image)
+        latent = vae_encode(w, vae, tile_image, checkpoint.tiled_vae)
         latent = w.set_latent_noise_mask(latent, tile_mask)
         sampler = w.sampler_custom_advanced(
             tile_model, positive, negative, latent, models.arch, **_sampler_params(sampling)