 import torch
-from typing import List
-from .linear_layer import LinearLayer
+from typing import List, Tuple
+from .recurrent_layer import RecurrentLayer
 
-
-class BlockMatrix:
+class BlockMatrix(torch.nn.Module):
     def __init__(self, n_blocks: int):
-        """
-        Initializes a BlockMatrix.
-
-        Args:
-            n (int): Size of the matrix (n x n).
-        """
+        super().__init__()
         self.n_blocks = n_blocks
-        self.matrix = [[None for _ in range(n_blocks)] for _ in range(n_blocks)]
+        self.matrix = torch.nn.ModuleList(
+            [torch.nn.ModuleList([None for _ in range(n_blocks)]) for _ in range(n_blocks)]
+        )
 
-    def __getitem__(self, idx):
-        """
-        Get item using matrix-style indexing.
-
-        Parameters:
-        - idx (tuple): A tuple (i, j) representing the row and column indices.
-
-        Returns:
-        The module or value at position (i, j).
-        """
+    def __getitem__(self, idx: tuple):
         if not isinstance(idx, tuple) or len(idx) != 2:
             raise IndexError("Index must be a tuple (i, j)")
-
         i, j = idx
         if not (0 <= i < self.n_blocks) or not (0 <= j < self.n_blocks):
             raise IndexError("Index out of bounds")
-
         return self.matrix[i][j]
 
-    def __setitem__(self, idx, value):
-        """
-        Set item using matrix-style indexing.
-
-        Args:
-            idx (tuple): A tuple (i, j) representing the row and column indices.
-            value: The value to set at position (i, j).
-        """
+    def __setitem__(self, idx: tuple, value: torch.nn.Module):
         if not isinstance(idx, tuple) or len(idx) != 2:
             raise IndexError("Index must be a tuple (i, j)")
-
         i, j = idx
-        if not (0 <= i < self.n_blocks) or not (0 <= j < self.n_blocks):
-            raise IndexError("Index out of bounds")
-
+        if not (0 <= i < self.n_blocks):
+            raise IndexError(f"Index {i} out of bounds for n_blocks {self.n_blocks}")
+        if not (0 <= j < self.n_blocks):
+            raise IndexError(f"Index {j} out of bounds for n_blocks {self.n_blocks}")
+        if i == j:
+            assert isinstance(value, RecurrentLayer), "Diagonal blocks must be an instance of nn4n.nn.RecurrentLayer"
+        else:
+            assert isinstance(value, torch.nn.Module), "Off-diagonal blocks must be an instance of torch.nn.Module"
+            assert not isinstance(value, RecurrentLayer), "Off-diagonal blocks cannot be an instance of nn4n.nn.RecurrentLayer"
         self.matrix[i][j] = value
 
-
 class BlockRecurrentLayer(torch.nn.Module):
-    def __init__(
-        self,
-        n_blocks: int,
-    ):
-        """
-        Hidden layer of the network. The layer is initialized by passing specs in layer_struct.
-
-        Parameters:
-        - n_blocks: number of blocks in the layer
-        """
+    def __init__(self, n_blocks: int, **kwargs):
         super().__init__()
-        self.blocks = BlockMatrix(n_blocks)
-
+        self.block_recurrent = BlockMatrix(n_blocks=n_blocks)
+        self.initialized = False
+
     @property
-    def n_blocks(self) -> int:
-        return self.blocks.n_blocks
+    def size(self) -> int:
+        return sum(self.block_sizes())
 
     @property
-    def size(self) -> int:
-        return sum(self.list_sizes())
+    def n_blocks(self) -> int:
+        return self.block_recurrent.n_blocks
+
+    def block_indices(self, block_idx: int) -> torch.Tensor:
+        ranges = self.block_ranges[block_idx]
+        return torch.arange(ranges[0], ranges[1])
+
+    def _compute_block_ranges(self):
+        block_ranges = []
+        start_idx = 0
+        for block_idx in range(self.n_blocks):
+            block_size = self.block_sizes()[block_idx]
+            if block_size == 0:
+                block_ranges.append(None)
+            else:
+                block_ranges.append((start_idx, start_idx + block_size))
+                start_idx += block_size
+        return block_ranges
+
+    def block_sizes(self) -> List[int]:
+        block_sizes = []
+        for block_idx in range(self.n_blocks):
+            diagonal_block = self.block_recurrent[block_idx, block_idx]
+            block_size = diagonal_block.size if diagonal_block is not None else 0
+            block_sizes.append(block_size)
+        return block_sizes
+
+    def set_projection(self, from_idx, to_idx, layer):
+        # NOTE: the indexing is "inversed": the projection is stored at [to_idx, from_idx],
+        # i.e. row = destination block, column = source block.
+        self[to_idx, from_idx] = layer
+
+    def set_recurrent(self, idx, layer):
+        self[idx, idx] = layer
+
+    def __getitem__(self, idx: tuple):
+        return self.block_recurrent[idx]
+
+    def __setitem__(self, idx: tuple, value: torch.nn.Module):
+        # Set the value, then re-check initialization (all diagonal blocks set) and recompute block ranges
+        self.block_recurrent[idx] = value
+        self.initialized = all(isinstance(self.block_recurrent[i, i], RecurrentLayer) for i in range(self.n_blocks))
+        self.block_ranges = self._compute_block_ranges()
 
-    def list_sizes(self) -> List[int]:
+    # FORWARD
+    # =================================================================================
+    def _parse_fr_v(self, fr: torch.Tensor, v: torch.Tensor) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
         """
-        Get the sizes of the blocks
+        Split the concatenated fr and v into per-block lists of tensors
         """
-        return [self.blocks[i, i].input_dim for i in range(self.n_blocks)]
-
-    def to(self, device):
-        """Move the network to the device (cpu/gpu)"""
-        super().to(device)
+        fr_list = []
+        v_list = []
         for i in range(self.n_blocks):
-            for j in range(self.n_blocks):
-                block = self.blocks[i, j]
-                if block is not None and isinstance(block, torch.nn.Module):
-                    block.to(device)
-        return self
+            fr_list.append(fr[:, self.block_indices(i)])
+            v_list.append(v[:, self.block_indices(i)])
+        return fr_list, v_list
 
-    # FORWARD
-    # =================================================================================
     def forward(
         self,
-        fr_hid_t: torch.Tensor,
-        v_hid_t: torch.Tensor,
-        u_in_t: torch.Tensor
+        fr: torch.Tensor,
+        v: torch.Tensor,
+        u_list: List[torch.Tensor]
     ) -> torch.Tensor:
         """
         Forwardly update network
 
         Parameters:
-        - fr_hid_t: hidden state (post-activation), shape: (batch_size, hidden_size)
-        - v_hid_t: hidden state (pre-activation), shape: (batch_size, hidden_size)
-        - u_in_t: input, shape: (batch_size, input_size)
+        - fr: hidden state (post-activation), shape: (batch_size, total_hidden_size)
+        - v: hidden state (pre-activation), shape: (batch_size, total_hidden_size)
+        - u_list: list of input tensors, each of shape (batch_size, input_dim)
 
         Returns:
-        - fr_t_next: hidden state (post-activation), shape: (batch_size, hidden_size)
-        - v_t_next: hidden state (pre-activation), shape: (batch_size, hidden_size)
+        - fr_t_next: hidden state (post-activation), shape: (batch_size, total_hidden_size)
+        - v_t_next: hidden state (pre-activation), shape: (batch_size, total_hidden_size)
         """
-        v_in_t = self.input_layer(u_in_t) if self.input_layer is not None else u_in_t
-        v_hid_t_next = self.linear_layer(fr_hid_t)
-        v_t_next = (1 - self.alpha) * v_hid_t + self.alpha * (v_hid_t_next + v_in_t)
-        if self.preact_noise > 0 and self.training:
-            _preact_noise = self._generate_noise(v_t_next.size(), self.preact_noise)
-            v_t_next = v_t_next + _preact_noise
-        fr_t_next = self.activation(v_t_next)
-        if self.postact_noise > 0 and self.training:
-            _postact_noise = self._generate_noise(fr_t_next.size(), self.postact_noise)
-            fr_t_next = fr_t_next + _postact_noise
-        return fr_t_next, v_t_next
+        if not self.initialized:
+            raise ValueError("BlockRecurrentLayer is not initialized. All diagonal blocks must be set before the forward pass.")
+
+        fr_list, v_list = self._parse_fr_v(fr, v)
+        u_aux_list = [torch.zeros_like(_fr, device=_fr.device) for _fr in fr_list]
+        fr_n_list, v_n_list = [None for _ in range(self.n_blocks)], [None for _ in range(self.n_blocks)]
+
+        # Accumulate cross-block input: project each source block's firing rate through the [to, from] layer
+        for from_idx in range(self.n_blocks):
+            for to_idx in range(self.n_blocks):
+                if to_idx == from_idx:
+                    continue
+                layer = self.block_recurrent[to_idx, from_idx]
+                if layer is not None:
+                    u_aux_list[to_idx] += layer(fr_list[from_idx])
+
+        # Update each diagonal block with its own state, external input, and accumulated cross-block input
+        for diag_idx in range(self.n_blocks):
+            layer = self.block_recurrent[diag_idx, diag_idx]
+            fr_n_list[diag_idx], v_n_list[diag_idx] = layer(fr_list[diag_idx], v_list[diag_idx], u_list[diag_idx], u_aux_list[diag_idx])
+
+        # Concatenate the per-block states back in block order
+        fr_next = torch.cat(fr_n_list, dim=-1)
+        v_next = torch.cat(v_n_list, dim=-1)
+        return fr_next, v_next
 
     # HELPER FUNCTIONS
     # ======================================================================================
     def plot_layer(self, **kwargs):
         """
         Plot the layer
         """
-        self.linear_layer.plot_layer(**kwargs)
-        if self.input_layer is not None:
-            self.input_layer.plot_layer(**kwargs)
+        raise NotImplementedError("Plotting is not implemented for BlockRecurrentLayer")
 
     def _get_specs(self):
         """
         Get specs of the layer
         """
-        return {
-            "input_dim": self.input_dim,
-            "output_dim": self.output_dim,
-            "hidden_size": self.hidden_size,
-            "alpha": self.alpha,
-            "learn_alpha": self.learn_alpha,
-            "preact_noise": self.preact_noise,
-            "postact_noise": self.postact_noise,
-        }
+        raise NotImplementedError("Getting specs is not implemented for BlockRecurrentLayer")
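
For orientation, a minimal usage sketch of the new block API follows. This is not part of the diff: the import path (nn4n.nn) is inferred from the assertion messages, and the RecurrentLayer constructor argument and the per-block input widths are hypothetical. The only facts taken from the code above are that diagonal blocks must be RecurrentLayer instances exposing .size and a (fr, v, u, u_aux) call returning (fr_next, v_next), and that off-diagonal blocks are plain torch.nn.Module projections from a source block's firing rate into the destination block's space.

import torch
import torch.nn as nn

# Assumed import path; the asserts in BlockMatrix refer to nn4n.nn.RecurrentLayer.
from nn4n.nn import RecurrentLayer, BlockRecurrentLayer

batch_size = 16
sizes = [64, 32]

block_rnn = BlockRecurrentLayer(n_blocks=2)

# Diagonal blocks: one RecurrentLayer per block. The constructor keyword below is
# hypothetical; BlockRecurrentLayer only relies on `.size` and the
# (fr, v, u, u_aux) call signature of each diagonal block.
block_rnn.set_recurrent(0, RecurrentLayer(hidden_size=sizes[0]))
block_rnn.set_recurrent(1, RecurrentLayer(hidden_size=sizes[1]))

# Off-diagonal blocks: any torch.nn.Module mapping the source block's firing rate
# into the destination block's space (stored internally at [to_idx, from_idx]).
block_rnn.set_projection(from_idx=0, to_idx=1, layer=nn.Linear(sizes[0], sizes[1], bias=False))
block_rnn.set_projection(from_idx=1, to_idx=0, layer=nn.Linear(sizes[1], sizes[0], bias=False))

assert block_rnn.initialized and block_rnn.size == sum(sizes)

# Hidden states are concatenated across blocks; external inputs are passed per block.
# Using each block's own width for its input is an assumption for illustration.
fr = torch.zeros(batch_size, block_rnn.size)
v = torch.zeros(batch_size, block_rnn.size)
u_list = [torch.randn(batch_size, sizes[0]), torch.randn(batch_size, sizes[1])]

fr_next, v_next = block_rnn(fr, v, u_list)  # each of shape (batch_size, block_rnn.size)

During forward, each destination block first accumulates u_aux[to] += block[to, from](fr[from]) over all source blocks, its diagonal RecurrentLayer then consumes (fr, v, u, u_aux) on that block's slice, and the per-block results are concatenated back in block order.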