[ADD] einsum expression for transpose convolution's KFAC-reduce (#45)
* [ADD] Utility function to compute input sizes of a convolution

* [ADD] Einsum expression and functional for transpose input unfolding

* [DEL] Print statements

* [DOC] Polish docstrings

* [DOC] Minor polish

* [FIX] Too long lines

* [ADD] einsum expression for transpose convolution's KFAC-reduce

* [FIX] Long line

* [REF] Minor polish

* [FIX] flake8
f-dangel committed Jun 6, 2024
1 parent acb65e2 commit 8cc761e
Showing 10 changed files with 459 additions and 9 deletions.
1 change: 1 addition & 0 deletions docs/api/expressions.md
@@ -5,3 +5,4 @@
:::einconv.expressions.convNd_kfc
:::einconv.expressions.convNd_kfac_reduce
:::einconv.expressions.conv_transposeNd_unfold
:::einconv.expressions.conv_transposeNd_kfac_reduce
32 changes: 27 additions & 5 deletions einconv/expressions/convNd_kfac_reduce.py
@@ -2,8 +2,10 @@
KFAC-reduce was introduced by:
- Eschenhagen, R. (2022). Kronecker-factored approximate curvature for linear
weight-sharing layers, Master thesis.
- [Eschenhagen, R., Immer, A., Turner, R. E., Schneider, F., & Hennig, P.
(2023). Kronecker-factored approximate curvature for modern neural network
architectures. In Advances in Neural Information Processing Systems (NeurIPS)]\
(https://arxiv.org/abs/2311.00636).
"""

from typing import List, Tuple, Union
@@ -27,6 +29,26 @@ def einsum_expression(
) -> Tuple[str, List[Tensor], Tuple[int, ...]]:
"""Generate einsum expression of input-based KFAC-reduce factor for convolution.
Let $\\mathbf{X}\\in\\mathbb{R}^{C_\\text{in}\\times I_1\\times I_2\\times\\dots}$
denote the input of a convolution. The unfolded input $[[\\mathbf{X}]]$
has dimension $(C_\\text{in} \\cdot K_1 \\cdot K_2 \\cdots) \\times (O_1 \\cdot O_2
\\cdots)$ where $K_i$ and $O_i$ are the kernel and output sizes of the convolution.
The input-based KFAC-reduce factor is the batch-averaged outer product
of the column-averaged unfolded input,
$$
\\hat{\\mathbf{\\Omega}} =
\\frac{1}{B \\cdot (O_1 \\cdot O_2 \\cdots)^2} \\sum_{b=1}^B
( [[\\mathbf{X}_b]] \\mathbf{1} )
( [[\\mathbf{X}_b]] \\mathbf{1} )^\\top
\\in \\mathbb{R}^{(C_\\text{in} \\cdot K_1 \\cdot K_2 \\cdots) \\times
(C_\\text{in} \\cdot K_1 \\cdot K_2 \\cdots)}
\\,,
$$
where $B$ is the batch size and $\\mathbf{X}_b$ is the convolution's input from the
$b$th data point.
Args:
x: Convolution input. Has shape ``[batch_size, in_channels, *input_sizes]``
where ``len(input_sizes) == N``.
@@ -46,8 +68,8 @@
Einsum equation
Einsum operands in order un-grouped input, patterns, un-grouped input, \
patterns, normalization scaling
Output shape: ``[groups, in_channels //groups * tot_kernel_sizes,\
in_channels //groups * tot_kernel_sizes]``
Output shape: ``[groups, in_channels // groups * tot_kernel_sizes,\
in_channels // groups * tot_kernel_sizes]``
"""
N = x.dim() - 2

@@ -83,7 +105,7 @@
x_ungrouped = rearrange(x, "n (g c_in) ... -> n g c_in ...", g=groups)
output_tot_size = Tensor([p.shape[1] for p in patterns]).int().prod()
batch_size = x.shape[0]
scale = Tensor([1.0 / (batch_size * output_tot_size**2)]).to(x.device).to(x.dtype)
scale = Tensor([1.0 / (batch_size * output_tot_size**2)]).to(x.device, x.dtype)
operands = [x_ungrouped, *patterns, *patterns, x_ungrouped, scale]

# construct output shape
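As a sanity check of the formula in the docstring above, the input-based KFAC-reduce factor can be computed naively with `torch.nn.functional.unfold` for a dense (`groups=1`) 2d convolution. A minimal sketch under these assumptions; it is not part of this commit:

```python
import torch
import torch.nn.functional as F

B, C_in, K = 4, 3, 3              # batch size, input channels, kernel size
x = torch.randn(B, C_in, 8, 8)    # convolution input

ux = F.unfold(x, kernel_size=K)   # [[X]]: shape [B, C_in * K * K, O1 * O2]
O = ux.shape[-1]                  # total output size O1 * O2

col_sum = ux.sum(dim=-1)          # [[X_b]] 1: shape [B, C_in * K * K]
omega_hat = torch.einsum("bj,bk->jk", col_sum, col_sum) / (B * O**2)
print(omega_hat.shape)            # [C_in * K * K, C_in * K * K] = [27, 27]
```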
24 changes: 22 additions & 2 deletions einconv/expressions/convNd_kfc.py
@@ -2,8 +2,9 @@
KFC was introduced by:
- Grosse, R., & Martens, J. (2016). A Kronecker-factored approximate Fisher matrix
for convolution layers. International Conference on Machine Learning (ICML).
- [Grosse, R., & Martens, J. (2016). A Kronecker-factored approximate Fisher matrix
for convolution layers. International Conference on Machine Learning (ICML).]\
(https://arxiv.org/abs/1602.01407)
"""

from typing import List, Tuple, Union
@@ -27,6 +28,25 @@ def einsum_expression(
) -> Tuple[str, List[Tensor], Tuple[int, ...]]:
"""Generate einsum expression of input-based KFC factor for convolution.
Let $\\mathbf{X}\\in\\mathbb{R}^{C_\\text{in}\\times I_1\\times I_2\\times\\dots}$
denote the input of a convolution. The unfolded input $[[\\mathbf{X}]]$
has dimension $(C_\\text{in} \\cdot K_1 \\cdot K_2 \\cdots) \\times (O_1 \\cdot O_2
\\cdots)$ where $K_i$ and $O_i$ are the kernel and output sizes of the convolution.
The input-based KFC factor is the batch-averaged outer product of the unfolded
input,
$$
\\mathbf{\\Omega} =
\\frac{1}{B} \\sum_{b=1}^B
[[\\mathbf{X}_b]] [[\\mathbf{X}_b]]^\\top
\\in \\mathbb{R}^{(C_\\text{in} \\cdot K_1 \\cdot K_2 \\cdots) \\times
(C_\\text{in} \\cdot K_1 \\cdot K_2 \\cdots)}
\\,,
$$
where $B$ is the batch size and $\\mathbf{X}_b$ is the convolution's input from the
$b$th data point.
Args:
x: Convolution input. Has shape ``[batch_size, in_channels, *input_sizes]``
where ``len(input_sizes) == N``.
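The KFC factor from the docstring above admits the same kind of naive check, this time without the column average — again a sketch for a dense 2d convolution, not part of this commit:

```python
import torch
import torch.nn.functional as F

B, C_in, K = 4, 3, 3
x = torch.randn(B, C_in, 8, 8)

ux = F.unfold(x, kernel_size=K)   # [[X]]: shape [B, C_in * K * K, O1 * O2]

# batch-averaged outer product [[X_b]] [[X_b]]^T
omega = torch.einsum("bji,bki->jk", ux, ux) / B
print(omega.shape)                # [C_in * K * K, C_in * K * K] = [27, 27]
```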
173 changes: 173 additions & 0 deletions einconv/expressions/conv_transposeNd_kfac_reduce.py
@@ -0,0 +1,173 @@
"""Input-based factor of the KFAC-reduce approximation for transpose convolutions.
KFAC-reduce was introduced by:
- [Eschenhagen, R., Immer, A., Turner, R. E., Schneider, F., & Hennig, P.
(2023). Kronecker-factored approximate curvature for modern neural network
architectures. In Advances in Neural Information Processing Systems (NeurIPS)]\
(https://arxiv.org/abs/2311.00636).
"""

from typing import List, Optional, Tuple, Union

from einops import rearrange
from torch import Tensor

import einconv
from einconv.expressions.utils import create_conv_index_patterns, translate_to_torch
from einconv.utils import _tuple, get_conv_input_size


def einsum_expression(
x: Tensor,
kernel_size: Union[int, Tuple[int, ...]],
stride: Union[int, Tuple[int, ...]] = 1,
padding: Union[int, Tuple[int, ...]] = 0,
output_padding: Union[int, Tuple[int, ...]] = 0,
output_size: Optional[Union[int, Tuple[int, ...]]] = None,
dilation: Union[int, Tuple[int, ...]] = 1,
groups: int = 1,
simplify: bool = True,
) -> Tuple[str, List[Tensor], Tuple[int, ...]]:
"""Generate einsum expr. of input-based KFAC-reduce factor for transp. convolution.
We describe the `N`d transpose convolution using its associated `N`d convolution
which maps an input of shape `[batch_size, in_channels, *input_sizes]` to an output
of shape `[batch_size, out_channels, *output_sizes]`. The transpose convolution's
input has shape `[batch_size, out_channels, *output_sizes]` and the output has shape
`[batch_size, in_channels, *input_sizes]`.
Let $\\mathbf{X}\\in\\mathbb{R}^{C_\\text{out}\\times O_1\\times O_2\\times\\dots}$
denote the input of a transpose convolution. The unfolded input $[[\\mathbf{X}
]]_\\top$ has dimension $(C_\\text{out} \\cdot K_1 \\cdot K_2 \\cdots) \\times
(I_1 \\cdot I_2 \\cdots)$ where $K_i$ and $I_i$ are the kernel and input sizes of
the associated convolution. The input-based KFAC-reduce factor is the batch-averaged
outer product of the column-averaged unfolded input,
$$
\\hat{\\mathbf{\\Omega}} =
\\frac{1}{B \\cdot (I_1 \\cdot I_2 \\cdots)^2} \\sum_{b=1}^B
( [[\\mathbf{X}_b]]_\\top \\mathbf{1} )
( [[\\mathbf{X}_b]]_\\top \\mathbf{1} )^\\top
\\in \\mathbb{R}^{(C_\\text{out} \\cdot K_1 \\cdot K_2 \\cdots) \\times
(C_\\text{out} \\cdot K_1 \\cdot K_2 \\cdots)}
\\,,
$$
where $B$ is the batch size and $\\mathbf{X}_b$ is the transpose convolution's
input from the $b$th data point.
Args:
x: Input tensor of shape `[batch_size, out_channels, *output_sizes]`.
kernel_size: Size of the convolutional kernel. Can be a single integer (shared
along all spatial dimensions), or an `N`-tuple of integers.
stride: Stride of the associated convolution. Can be a single integer (shared
along all spatial dimensions), or an `N`-tuple of integers. Default: `1`.
padding: Padding of the associated convolution. Can be a single integer (shared
along all spatial dimensions), or an `N`-tuple of integers. Default: `0`.
output_padding: Number of unused pixels at the end of the spatial domain.
This is used to resolve the ambiguity that a convolution can map different
input sizes to the same output size if its stride is different from 1.
Instead of specifying this argument, you can directly specify the output
size of the transpose convolution (i.e. the input size of the associated
convolution via the `output_size` argument). Can be a single integer
(shared along all spatial dimensions), or an `N`-tuple. Default: `0`.
output_size: Size of the output of the transpose convolution (i.e. the input
size of the associated convolution). Specifying this argument will override
the `output_padding` argument. Can be a single integer (shared along all
spatial dimensions), or an `N`-tuple of integers. Default: `None`.
dilation: Dilation of the convolution. Can be a single integer (shared along
all spatial dimensions), or an `N`-tuple of integers. Default: `1`.
groups: In how many groups to split the channels. Default: `1`.
simplify: Whether to simplify the einsum expression. Default: `True`.
Returns:
Einsum equation
Einsum operands in order un-grouped input, patterns, un-grouped input, \
patterns, normalization scaling
Output shape: `[groups, out_channels // groups * tot_kernel_sizes,\
out_channels // groups * tot_kernel_sizes]`
"""
N = x.dim() - 2

# construct einsum equation
x1_str = "n g c_out " + " ".join([f"o{i}" for i in range(N)])
x2_str = "n g c_out_ " + " ".join([f"o{i}_" for i in range(N)])
pattern1_strs: List[str] = [f"k{i} o{i} i{i}" for i in range(N)]
pattern2_strs: List[str] = [f"k{i}_ o{i}_ i{i}_" for i in range(N)]
scale_str = "s"
lhs = ",".join([x1_str, *pattern1_strs, *pattern2_strs, x2_str, scale_str])
rhs = (
"g c_out "
+ " ".join([f"k{i}" for i in range(N)])
+ " c_out_ "
+ " ".join([f"k{i}_" for i in range(N)])
)
equation = "->".join([lhs, rhs])
equation = translate_to_torch(equation)

conv_output_size = x.shape[2:]
t_kernel_size = _tuple(kernel_size, N)
t_stride = _tuple(stride, N)
t_padding = _tuple(padding, N)
t_dilation = _tuple(dilation, N)

# infer output_padding from convolution's input size
if output_size is not None:
t_output_size = _tuple(output_size, N)
t_output_padding = tuple(
output_size - get_conv_input_size(out, K, S, P, 0, D)
for output_size, out, K, S, P, D in zip(
t_output_size,
conv_output_size,
t_kernel_size,
t_stride,
t_padding,
t_dilation,
)
)
else:
t_output_padding = _tuple(output_padding, N)

conv_input_size = tuple(
get_conv_input_size(out, K, S, P, output_padding, D)
for out, K, S, P, output_padding, D in zip(
conv_output_size,
t_kernel_size,
t_stride,
t_padding,
t_output_padding,
t_dilation,
)
)

# construct einsum operands
patterns = create_conv_index_patterns(
N,
input_size=conv_input_size,
kernel_size=t_kernel_size,
stride=t_stride,
padding=t_padding,
dilation=dilation,
device=x.device,
dtype=x.dtype,
)
x_ungrouped = rearrange(x, "n (g c_in) ... -> n g c_in ...", g=groups)
conv_input_tot_size = Tensor(conv_input_size).int().prod()
batch_size, out_channels = x.shape[:2]
scale = Tensor([1.0 / (batch_size * conv_input_tot_size**2)]).to(x.device, x.dtype)
operands = [x_ungrouped, *patterns, *patterns, x_ungrouped, scale]

# construct output shape
t_kernel_size = _tuple(kernel_size, N)
kernel_tot_size = int(Tensor(t_kernel_size).int().prod())
shape = (
groups,
out_channels // groups * kernel_tot_size,
out_channels // groups * kernel_tot_size,
)

if simplify:
equation, operands = einconv.simplify(equation, operands)

return equation, operands, shape
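A hypothetical usage sketch of the new expression: evaluating it with `torch.einsum` and reshaping the result into the returned shape are assumptions on top of this diff (the function itself only builds the equation and operands):

```python
import torch

from einconv.expressions.conv_transposeNd_kfac_reduce import einsum_expression

# input of a 2d transpose convolution: [batch_size, out_channels, *output_sizes]
x = torch.randn(4, 3, 8, 8)

equation, operands, shape = einsum_expression(
    x, kernel_size=3, stride=2, padding=1, output_padding=1
)
omega_hat = torch.einsum(equation, *operands).reshape(shape)
print(omega_hat.shape)  # [groups, out_channels * K1 * K2, ...] = [1, 27, 27]
```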
21 changes: 21 additions & 0 deletions einconv/expressions/conv_transposeNd_unfold.py
@@ -30,6 +30,27 @@ def einsum_expression(
has shape `[batch_size, out_channels, *output_sizes]` and the output has shape
`[batch_size, in_channels, *input_sizes]`.
Let $\\mathbf{X}\\in\\mathbb{R}^{C_\\text{out}\\times O_1\\times O_2\\times\\dots}$
denote the input of a transpose convolution, $\\mathbf{W} \\in \\mathbb{R}^{
C_\\text{out} \\times C_\\text{in} \\times K_1\\times K_2\\times\\dots}$ its kernel
and $\\mathbf{Y}\\in\\mathbb{R}^{C_\\text{in}\\times I_1\\times I_2\\times\\dots}$
its output. The unfolded input $[[\\mathbf{X}]]_\\top$ has dimension
$(C_\\text{out} \\cdot K_1 \\cdot K_2 \\cdots) \\times (I_1 \\cdot I_2 \\cdots)$ and
can be used to express transpose convolution as matrix multiplication,
$$
\\mathrm{mat}(\\mathbf{Y})
=
\\mathrm{mat}(\\mathbf{W})
[[\\mathbf{X}]]_\\top
\\,,
$$
where $\\mathrm{mat}(\\mathbf{Y}) \\in \\mathbb{R}^{C_\\text{in}\\times (I_1\\cdot
I_2 \\cdots)}$ and $\\mathrm{mat}(\\mathbf{W}) \\in \\mathbb{R}^{C_\\text{in}\\times
(C_\\text{out} \\cdot K_1\\cdot K_2 \\cdots)}$ are matrix views of $\\mathbf{Y},
\\mathbf{W}$ (note that $\\mathbf{W}$ must also be transposed before matricizing).
Args:
x: Input to the `N`d transpose convolution. Has shape
`[batch_size, out_channels, *output_sizes]` where `len(output_sizes) == N`.
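The matrix-multiplication identity above can be verified numerically for a 1d transpose convolution by materializing the unfolding entry by entry. A sketch under this docstring's conventions (torch's `conv_transpose1d` stores the kernel as `[C, D, K]`, where `C`/`D` are the channel counts of the input `x` and the output `y`); it is not part of this commit:

```python
import torch
import torch.nn.functional as F

B, C, D, L, K = 2, 3, 4, 5, 3    # batch, channels of x and y, spatial size, kernel
stride, padding, dilation = 2, 1, 1
x = torch.randn(B, C, L)         # transpose convolution input
w = torch.randn(C, D, K)         # kernel in torch's conv_transpose1d layout

I = (L - 1) * stride - 2 * padding + dilation * (K - 1) + 1  # output size

# materialize [[X]]_T of shape [B, C * K, I]: entry (c * K + k, i) collects
# x[c, l] whenever i == l * stride + k * dilation - padding
U = torch.zeros(B, C * K, I)
for l in range(L):
    for k in range(K):
        i = l * stride + k * dilation - padding
        if 0 <= i < I:
            U[:, torch.arange(C) * K + k, i] += x[:, :, l]

w_mat = w.permute(1, 0, 2).reshape(D, C * K)  # transpose W, then matricize
y_mat = w_mat @ U                             # mat(Y) = mat(W) [[X]]_T: [B, D, I]
y = F.conv_transpose1d(x, w, stride=stride, padding=padding, dilation=dilation)
print(torch.allclose(y, y_mat, atol=1e-6))    # True
```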
4 changes: 3 additions & 1 deletion mkdocs.yml
@@ -1,8 +1,10 @@
site_name: Einconv
site_url: https://example.com # TODO Fill in the link from the hosting platform
site_url: https://einconv.readthedocs.io
repo_url: https://github.com/f-dangel/einconv/
repo_name: f-dangel/einconv
site_author: Felix Dangel
watch:
- einconv
nav:
- Getting Started: index.md
- Tutorials: