diff --git a/lottie/src/main/java/com/airbnb/lottie/utils/FastBlur.java b/lottie/src/main/java/com/airbnb/lottie/utils/FastBlur.java
index e0e03b695a..7590f71df1 100644
--- a/lottie/src/main/java/com/airbnb/lottie/utils/FastBlur.java
+++ b/lottie/src/main/java/com/airbnb/lottie/utils/FastBlur.java
@@ -3,11 +3,31 @@
 import android.graphics.Bitmap;
 import android.graphics.Rect;
 
-import java.nio.ByteBuffer;
 import java.nio.IntBuffer;
 
+/**
+ * A reasonably fast box blur implementation on the CPU.
+ * <p>
+ * Used as a fallback to approximate Gaussian blur when the RenderEffect
+ * API is not available, and the file requests that a precomp or image
+ * layer be blurred.
+ * <p>
+ * The blur works by keeping a moving average of the last k pixels and
+ * writing that average to the output. It performs a horizontal then
+ * vertical pass. This gives it a complexity of O(wh) --- in particular,
+ * it iterates through all pixels twice. Anything asymptotically slower
+ * than this would be infeasible to run on a device.
+ * <p>
+ * As an additional optimization, we try to skip runs of the same color
+ * and avoid writing to the output for these pixels. Lotties often
+ * have large areas of single-color fills, so this helps performance
+ * in these cases.
+ * <p>
+ * The runtime seems to be sensitive to function calls and does not
+ * appear to inline them, so some parts of functions are repeated instead
+ * of abstracted, for performance.
+ */
 public class FastBlur {
-
   private IntBuffer buf1;
   private IntBuffer buf2;
 
@@ -25,6 +45,11 @@ private void ensureCapacity(Bitmap image) {
     buf2.rewind();
   }
 
+  /**
+   * Initializes the accumulator for the given row of the image. The left
+   * half of the kernel is all clamped, which is why we first accumulate
+   * the leftmost pixel radius + 1 times.
+   */
   private void initialAccumulateHorizontal(int[] src, int[] sumsByChannel, int rowStart, int radius) {
     int startVal = src[rowStart];
     int startValR = startVal & 0xff;
@@ -46,6 +71,12 @@ private void initialAccumulateHorizontal(int[] src, int[] sumsByChannel, int row
     }
   }
 
+  /**
+   * A simple, straightforward implementation of a horizontal pass. It
+   * performs clamping on every iteration. It's used when the width of
+   * the image is smaller than the kernel size. Otherwise, horizontalPass()
+   * is used, which skips clamping whenever safe and is slightly faster.
+   */
   private void naiveHorizontalPass(int[] src, int[] dst, int stride, Rect rect, int radius) {
     int kernelSize = 2 * radius + 1;
     int[] sumsByChannel = new int[4];
@@ -75,6 +106,7 @@ private void naiveHorizontalPass(int[] src, int[] dst, int stride, Rect rect, in
                 (sumsByChannel[3] / kernelSize << 24)
         );
 
+        // Quickly skip single-color areas
         while (dst[base] == newVal &&
             x < width - radius - 1 &&
             src[baseLeft] == src[baseRight]) {
@@ -97,6 +129,10 @@ private void naiveHorizontalPass(int[] src, int[] dst, int stride, Rect rect, in
     }
   }
 
+  /**
+   * Single horizontal pass: blurs the image in src[] with the radius horizontally
+   * and writes the result to dst[].
+   */
   private void horizontalPass(int[] src, int[] dst, int stride, Rect rect, int radius) {
     int kernelSize = 2 * radius + 1;
     int[] sumsByChannel = new int[4];
@@ -151,6 +187,7 @@ private void horizontalPass(int[] src, int[] dst, int stride, Rect rect, int rad
                 (sumsByChannel[3] / kernelSize << 24)
         );
 
+        // Quickly skip single-color areas
         while (dst[base] == newVal &&
             x < width - radius - 1 &&
             src[baseLeft] == src[baseRight]) {
@@ -200,6 +237,11 @@ private void horizontalPass(int[] src, int[] dst, int stride, Rect rect, int rad
     }
   }
 
+  /**
+   * Initializes the accumulator for the given column of the image. The top
+   * half of the kernel is all clamped, which is why we first accumulate
+   * the topmost pixel radius + 1 times.
+   */
   private void initialAccumulateVertical(int[] src, int[] sumsByChannel, int columnStart, int stride, int radius) {
     int startVal = src[columnStart];
     int startValR = startVal & 0xff;
@@ -221,7 +263,11 @@ private void initialAccumulateVertical(int[] src, int[] sumsByChannel, int colum
     }
   }
 
-  private void verticalPass(int[] src, int[] dst, int stride, Rect rect, int radius) {
+  /**
+   * A straightforward implementation of a vertical blur pass. See
+   * naiveHorizontalPass() for more info.
+   */
+  private void naiveVerticalPass(int[] src, int[] dst, int stride, Rect rect, int radius) {
     int kernelSize = 2 * radius + 1;
     int[] sumsByChannel = new int[4];
 
@@ -238,12 +284,11 @@ private void verticalPass(int[] src, int[] dst, int stride, Rect rect, int radiu
       int topPixelOffset = (-radius) * stride;
       int bottomPixelOffset = (radius + 1) * stride;
 
-      // Y is clamped to the top
       int y = 0;
-      while (y < radius) {
+      while (y < height) {
         int base = columnStart + stride * y;
-        int baseTop = columnStart;
-        int baseBottom = base + bottomPixelOffset;
+        int baseTop = Math.max(base + topPixelOffset, columnStart);
+        int baseBottom = Math.min(base + bottomPixelOffset, lastPixel);
 
         int newVal = (
             (sumsByChannel[0] / kernelSize) |
@@ -252,6 +297,7 @@ private void verticalPass(int[] src, int[] dst, int stride, Rect rect, int radiu
                 (sumsByChannel[3] / kernelSize << 24)
         );
 
+        // Quickly skip single-color areas
         while (dst[base] == newVal &&
             y < height - radius - 1 &&
             src[baseTop] == src[baseBottom]) {
@@ -262,6 +308,7 @@ private void verticalPass(int[] src, int[] dst, int stride, Rect rect, int radiu
         }
 
         dst[base] = newVal;
+
         int top = src[baseTop];
         int bottom = src[baseBottom];
 
@@ -272,20 +319,43 @@ private void verticalPass(int[] src, int[] dst, int stride, Rect rect, int radiu
 
         y++;
       }
+    }
+  }
 
-      // Y is not clamped
-      while (y < height - radius - 1) {
+  /**
+   * Single vertical pass: blurs the image in src[] with the radius vertically
+   * and writes the result to dst[].
+   */
+  private void verticalPass(int[] src, int[] dst, int stride, Rect rect, int radius) {
+    int kernelSize = 2 * radius + 1;
+    int[] sumsByChannel = new int[4];
+
+    int firstPixel = stride * rect.top + rect.left;
+    int width = rect.width();
+    int height = rect.height();
+    for (int x = 0; x < width; x++) {
+      int columnStart = firstPixel + x;
+      int lastPixel = columnStart + stride * (height - 1);
+
+      initialAccumulateVertical(src, sumsByChannel, columnStart, stride, radius);
+
+      int topPixelOffset = (-radius) * stride;
+      int bottomPixelOffset = (radius + 1) * stride;
+
+      int y = 0;
+      while (y < radius) {
         int base = columnStart + stride * y;
-        int baseTop = base + topPixelOffset;
+        int baseTop = columnStart;
         int baseBottom = base + bottomPixelOffset;
 
-        dst[base] = (
+        int newVal = (
             (sumsByChannel[0] / kernelSize) |
                 (sumsByChannel[1] / kernelSize << 8) |
                 (sumsByChannel[2] / kernelSize << 16) |
                 (sumsByChannel[3] / kernelSize << 24)
         );
 
+        dst[base] = newVal;
         int top = src[baseTop];
         int bottom = src[baseBottom];
 
@@ -297,19 +367,31 @@ private void verticalPass(int[] src, int[] dst, int stride, Rect rect, int radiu
         y++;
       }
 
-      // Y is clamped to the bottom
-      while (y < height) {
+      // Y is not clamped
+      while (y < height - radius - 1) {
         int base = columnStart + stride * y;
         int baseTop = base + topPixelOffset;
-        int baseBottom = lastPixel;
+        int baseBottom = base + bottomPixelOffset;
 
-        dst[base] = (
+        int newVal = (
             (sumsByChannel[0] / kernelSize) |
                 (sumsByChannel[1] / kernelSize << 8) |
                 (sumsByChannel[2] / kernelSize << 16) |
                 (sumsByChannel[3] / kernelSize << 24)
         );
 
+        // Quickly skip single-color areas
+        while (dst[base] == newVal &&
+            y < height - radius - 1 &&
+            src[baseTop] == src[baseBottom]) {
+          y++;
+          base += stride;
+          baseTop += stride;
+          baseBottom += stride;
+        }
+
+        dst[base] = newVal;
+
         int top = src[baseTop];
         int bottom = src[baseBottom];
 
@@ -320,50 +402,20 @@ private void verticalPass(int[] src, int[] dst, int stride, Rect rect, int radiu
 
         y++;
       }
-    }
-  }
-
-  private void naiveVerticalPass(int[] src, int[] dst, int stride, Rect rect, int radius) {
-    int kernelSize = 2 * radius + 1;
-    int[] sumsByChannel = new int[4];
-
-    int firstPixel = stride * rect.top + rect.left;
-    int width = rect.width();
-    int height = rect.height();
-    for (int x = 0; x < width; x++) {
-      // Init with the first element only
-      int columnStart = firstPixel + x;
-      int lastPixel = columnStart + stride * (height - 1);
 
-      initialAccumulateVertical(src, sumsByChannel, columnStart, stride, radius);
-
-      int topPixelOffset = (-radius) * stride;
-      int bottomPixelOffset = (radius + 1) * stride;
-
-      int y = 0;
+      // Y is clamped to the bottom
       while (y < height) {
         int base = columnStart + stride * y;
-        int baseTop = Math.max(base + topPixelOffset, columnStart);
-        int baseBottom = Math.min(base + bottomPixelOffset, lastPixel);
+        int baseTop = base + topPixelOffset;
+        int baseBottom = lastPixel;
 
-        int newVal = (
+        dst[base] = (
             (sumsByChannel[0] / kernelSize) |
                 (sumsByChannel[1] / kernelSize << 8) |
                 (sumsByChannel[2] / kernelSize << 16) |
                 (sumsByChannel[3] / kernelSize << 24)
         );
 
-        while (dst[base] == newVal &&
-            y < height - radius - 1 &&
-            src[baseTop] == src[baseBottom]) {
-          y++;
-          base += stride;
-          baseTop += stride;
-          baseBottom += stride;
-        }
-
-        dst[base] = newVal;
-
         int top = src[baseTop];
         int bottom = src[baseBottom];
 
@@ -377,23 +429,37 @@ private void naiveVerticalPass(int[] src, int[] dst, int stride, Rect rect, int
     }
   }
 
-  void blurPass(int[] src, int[] dst, int byteStride, Rect rect, int radius) {
+  /**
+   * Performs a single horizontal + vertical blur pass on the pixels
+   * in pixelsInOut, overwriting them. Needs a same-size scratch
+   * buffer, as it performs two passes internally.
+   */
+  void blurPass(int[] pixelsInOut, int[] scratch, int byteStride, Rect rect, int radius) {
     int kernelSize = 2 * radius - 1;
     int stride = byteStride / 4;
 
     if (rect.width() >= kernelSize) {
-      horizontalPass(src, dst, stride, rect, radius);
+      horizontalPass(pixelsInOut, scratch, stride, rect, radius);
     } else {
-      naiveHorizontalPass(src, dst, stride, rect, radius);
+      naiveHorizontalPass(pixelsInOut, scratch, stride, rect, radius);
     }
 
     if (rect.height() >= kernelSize) {
-      verticalPass(dst, src, stride, rect, radius);
+      verticalPass(scratch, pixelsInOut, stride, rect, radius);
     } else {
-      naiveVerticalPass(dst, src, stride, rect, radius);
+      naiveVerticalPass(scratch, pixelsInOut, stride, rect, radius);
     }
   }
 
+  /**
+   * Performs a box blur on a rectangular region defined by the provided
+   * Rect of the provided Bitmap.
+   * <p>
+   * The kernel is a uniform kernel where k_i = 1 / n. Note that this
+   * results in a more pronounced, anisotropic blur than a true Gaussian
+   * blur. However, performing a true Gaussian blur is prohibitively
+   * slow.
+   */
   public void applyBlur(Bitmap image, int radius, Rect rect) {
     if (radius < 1) {
       return;
@@ -408,7 +474,12 @@ public void applyBlur(Bitmap image, int radius, Rect rect) {
     int stride = image.getRowBytes();
     blurPass(buf1.array(), buf2.array(), stride, rect, radius);
 
-    //blurPass(buf1.array(), buf2.array(), stride, rect, radius / 2);
+    // An alternative here is to apply the blur more than once with
+    // smaller kernels. The more box blurs are stacked on top of each
+    // other, the more the final result resembles a true Gaussian blur.
+    //
+    // However, this is too slow in the general case, so we would be left
+    // with only heuristics, which are tricky to get right.
 
     image.copyPixelsFromBuffer(buf1);
   }
diff --git a/lottie/src/main/java/com/airbnb/lottie/utils/OffscreenLayer.java b/lottie/src/main/java/com/airbnb/lottie/utils/OffscreenLayer.java
index abd7fea06f..aa4e5809c7 100644
--- a/lottie/src/main/java/com/airbnb/lottie/utils/OffscreenLayer.java
+++ b/lottie/src/main/java/com/airbnb/lottie/utils/OffscreenLayer.java
@@ -46,6 +46,7 @@ public static class ComposeOp {
     @Nullable public BlendModeCompat blendMode;
     @Nullable public ColorFilter colorFilter;
     @Nullable public DropShadow shadow;
+    /** Blur to apply as returned by BlurKeyframeAnimation::evaluate. */
     public float blur;
 
     public ComposeOp() {
@@ -167,7 +168,7 @@ private RenderStrategy chooseRenderStrategy(Canvas parentCanvas, ComposeOp op) {
 
     // Beyond this point, we are sure that we need to render a drop shadow or blur.
 
-    if (Build.VERSION.SDK_INT < Build.VERSION_CODES.Q || true) { // { !parentCanvas.isHardwareAccelerated()) {
+    if (Build.VERSION.SDK_INT < Build.VERSION_CODES.Q || !parentCanvas.isHardwareAccelerated()) {
       // We don't have support for the RenderNode API, or we're rendering to a software canvas
       // which doesn't support RenderNodes anyhow. This is the slowest path: render to a bitmap,
       // add a shadow/blur manually on CPU.
@@ -506,7 +507,7 @@ private void renderBitmapShadow(Canvas targetCanvas, DropShadow shadow) {
     // Draw the image onto the mask layer first. Since the mask layer is ALPHA_8, this discards color information.
     // Align it so that when drawn in the end, it originates at targetRect.x, targetRect.y
     // the int casts are very important here - they save us from some slow path for non-integer coords
-    shadowMaskBitmapCanvas.drawBitmap(bitmap, (int)Math.round(offsetX * pixelScaleX), (int)Math.round(offsetY * pixelScaleY), null);
+    shadowMaskBitmapCanvas.drawBitmap(bitmap, Math.round(offsetX * pixelScaleX), Math.round(offsetY * pixelScaleY), null);
 
     // Prepare the shadow paint. This is the paint that will perform a blur and a tint of the mask
     if (shadowBlurFilter == null || lastShadowBlurRadius != shadow.getRadius()) {