
Commit ab033f4
feat: batch size implementation
1 parent 33539c5

7 files changed: +42 -26 lines changed

copy_task.py
Lines changed: 2 additions & 1 deletion

@@ -53,6 +53,7 @@ def train(epochs=50_000):
     writer.add_scalar("hidden_layer_size", hidden_layer_size)
     writer.add_scalar("lstm_controller", lstm_controller)
     writer.add_scalar("seed", seed)
+    writer.add_scalar("batch_size", batch_size)

     model = NTM(vector_length, hidden_layer_size, memory_size, lstm_controller)

@@ -76,7 +77,7 @@ def train(epochs=50_000):
             _, state = model(vector, state)
         y_out = torch.zeros(target.size())
         for j in range(len(target)):
-            y_out[j], state = model(torch.zeros(1, vector_length + 1), state)
+            y_out[j], state = model(torch.zeros(batch_size, vector_length + 1), state)
         loss = F.binary_cross_entropy(y_out, target)
         loss.backward()
         optimizer.step()
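Note on shapes: the tasks keep sequences in (seq_len, batch, features) layout, so the zero vector fed at each read-back step needs a leading batch dimension rather than a hard-coded 1. A minimal sketch of that contract (illustrative shapes only; the model call itself is elided):

import torch
import torch.nn.functional as F

batch_size, vector_length = 2, 8

# One zero-input query per batch element at every output step;
# before this commit the leading dimension was hard-coded to 1.
query = torch.zeros(batch_size, vector_length + 1)
assert query.shape == (2, 9)

# binary_cross_entropy reduces elementwise over all dimensions,
# so batched y_out/target of matching shape need no other change.
y_out = torch.rand(5, batch_size, vector_length)
target = torch.bernoulli(torch.full_like(y_out, 0.5))
loss = F.binary_cross_entropy(y_out, target)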

ntm/head.py
Lines changed: 8 additions & 2 deletions

@@ -32,17 +32,23 @@ def get_head_weight(self, x, previous_state, memory_read):
         w_c = F.softmax(beta * F.cosine_similarity(memory_read + 1e-16, k.unsqueeze(1) + 1e-16, dim=-1), dim=1)
         # Focusing by location
         w_g = g * w_c + (1 - g) * previous_state
-        w_t = _convolve(w_g, s)
+        w_t = self.shift(w_g, s)
         w = w_t ** gamma
         w = torch.div(w, torch.sum(w, dim=1).view(-1, 1) + 1e-16)
         return w

+    def shift(self, w_g, s):
+        result = torch.zeros(w_g.size())
+        for b in range(len(w_g)):
+            result[b] = _convolve(w_g[b], s[b])
+        return result
+

 class ReadHead(Head):
     def forward(self, x, previous_state):
         memory_read = self.memory.read()
         w = self.get_head_weight(x, previous_state, memory_read)
-        return torch.matmul(w, memory_read), w
+        return torch.matmul(w.unsqueeze(1), memory_read).squeeze(1), w


 class WriteHead(Head):
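The new shift method applies _convolve one batch row at a time, and the read now contracts a per-example weighting against a batched memory. A small sketch of the batched read, assuming memory has shape (batch, N, M) and the head weighting w has shape (batch, N):

import torch

batch, N, M = 2, 128, 20
memory = torch.randn(batch, N, M)
w = torch.softmax(torch.randn(batch, N), dim=1)

# (batch, 1, N) @ (batch, N, M) -> (batch, 1, M) -> (batch, M)
read = torch.matmul(w.unsqueeze(1), memory).squeeze(1)

# Same result computed one example at a time:
manual = torch.stack([w[b] @ memory[b] for b in range(batch)])
assert torch.allclose(read, manual)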

ntm/memory.py
Lines changed: 3 additions & 3 deletions

@@ -7,7 +7,7 @@ class Memory(nn.Module):
     def __init__(self, memory_size):
         super(Memory, self).__init__()
         self._memory_size = memory_size
-        print(self._memory_size)
+
         # Initialize memory bias
         stdev = 1 / (np.sqrt(memory_size[0] + memory_size[1]))
         intial_state = torch.Tensor(memory_size[0], memory_size[1]).uniform_(-stdev, stdev)

@@ -29,8 +29,8 @@ def read(self):
         return self.memory

     def write(self, w, e, a):
-        self.memory = self.memory * (1 - torch.t(w) * e)
-        self.memory = self.memory + torch.t(w) * a
+        self.memory = self.memory * (1 - torch.matmul(torch.t(w), e))
+        self.memory = self.memory + torch.matmul(torch.t(w), a)
         return self.memory

     def size(self):
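Replacing the elementwise torch.t(w) * e with torch.matmul(torch.t(w), e) turns the erase and add terms into proper matrix products. With w of shape (batch, N) and e, a of shape (batch, M), the product is an (N, M) update equal to the sum over the batch of the per-example outer products. A quick check of that equivalence (shapes assumed from the rest of the diff):

import torch

batch, N, M = 2, 128, 20
w = torch.softmax(torch.randn(batch, N), dim=1)  # head weightings
e = torch.sigmoid(torch.randn(batch, M))         # erase vectors

erase = torch.matmul(torch.t(w), e)              # (N, M)
manual = sum(torch.outer(w[b], e[b]) for b in range(batch))
assert torch.allclose(erase, manual)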

ntm/ntm.py
Lines changed: 1 addition & 2 deletions

@@ -17,12 +17,11 @@ def __init__(self, vector_length, hidden_size, memory_size, lstm_controller=True):
         nn.init.xavier_uniform_(self.fc.weight, gain=1)
         nn.init.normal_(self.fc.bias, std=0.01)

-    def get_initial_state(self, batch_size):
+    def get_initial_state(self, batch_size=1):
         self.memory.reset(batch_size)
         controller_state = self.controller.get_initial_state(batch_size)
         read = self.memory.get_initial_state(batch_size)
         read_head_state = self.read_head.get_initial_state(batch_size)
-        print("read_head_state.shape", read_head_state.shape)
         write_head_state = self.write_head.get_initial_state(batch_size)
         return (read, read_head_state, write_head_state, controller_state)
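Defaulting batch_size to 1 keeps existing unbatched call sites working. Hypothetical usage, assuming model is an NTM instance as constructed in the task scripts:

state = model.get_initial_state()              # unbatched callers: batch_size = 1
state = model.get_initial_state(batch_size=2)  # batched training, as in repeat_task.py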

ntm/utils.py
Lines changed: 3 additions & 6 deletions

@@ -15,12 +15,9 @@ def circular_convolution(w, s):

 def _convolve(w, s):
     """Circular convolution implementation."""
-    assert s.size(1) == 3
-    print(w.shape)
-    t = torch.cat([w[:, -1:], w, w[:, :1]], dim=1)
-    print(t.shape)
-    c = F.conv1d(t.unsqueeze(1), s.view(1, 1, -1))
-    print(c.shape)
+    assert s.size(0) == 3
+    t = torch.cat([w[-1:], w, w[:1]], dim=0)
+    c = F.conv1d(t.view(1, 1, -1), s.view(1, 1, -1)).view(-1)
     return c
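The rewritten _convolve pads the weighting with one wrapped element on each side and runs F.conv1d over it. Note that F.conv1d computes a cross-correlation (the kernel is not flipped) and expects floating-point inputs. A self-contained copy for reference, renamed convolve here to avoid shadowing the repo's function:

import torch
import torch.nn.functional as F

def convolve(w, s):
    """Circular convolution of a 1-D weighting w with a 3-tap shift kernel s."""
    assert s.size(0) == 3
    t = torch.cat([w[-1:], w, w[:1]], dim=0)  # wrap one element on each side
    return F.conv1d(t.view(1, 1, -1), s.view(1, 1, -1)).view(-1)

w = torch.tensor([0.0, 0.0, 1.0, 0.0, 0.0])
s = torch.tensor([0.5, 0.0, 0.5])
print(convolve(w, s))  # tensor([0.0000, 0.5000, 0.0000, 0.5000, 0.0000])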

repeat_task.py
Lines changed: 12 additions & 11 deletions

@@ -23,21 +23,20 @@
 # torch.manual_seed(seed)


-def get_training_sequence(sequence_min_length, sequence_max_length, vector_length):
+def get_training_sequence(sequence_min_length, sequence_max_length, vector_length, batch_size=1):
     sequence_length = random.randint(sequence_min_length, sequence_max_length)
     repeat = random.randint(sequence_min_length, sequence_max_length)

-    target = torch.bernoulli(torch.Tensor(sequence_length, vector_length).uniform_(0, 1))
-    target = torch.unsqueeze(target, 1)
+    target = torch.bernoulli(torch.Tensor(sequence_length, batch_size, vector_length).uniform_(0, 1))

-    input = torch.zeros(sequence_length + 2, 1, vector_length + 2)
+    input = torch.zeros(sequence_length + 2, batch_size, vector_length + 2)
     input[:sequence_length, :, :vector_length] = target
     # delimiter vector
     input[sequence_length, :, vector_length] = 1.0
     # repeat channel
-    input[sequence_length+1, :, vector_length+1] = repeat / sequence_max_length
+    input[sequence_length + 1, :, vector_length + 1] = repeat / sequence_max_length

-    output = torch.zeros(sequence_length * repeat + 1, 1, vector_length + 1)
+    output = torch.zeros(sequence_length * repeat + 1, batch_size, vector_length + 1)
     output[:sequence_length * repeat, :, :vector_length] = target.clone().repeat(repeat, 1, 1)
     # delimiter vector
     output[-1, :, -1] = 1.0

@@ -53,6 +52,7 @@ def train(epochs=50_000):
     vector_length = 8
     memory_size = (128, 20)
     hidden_layer_size = 100
+    batch_size = 2
     lstm_controller = not args.ff

     writer.add_scalar("sequence_min_length", sequence_min_length)

@@ -63,6 +63,7 @@ def train(epochs=50_000):
     writer.add_scalar("hidden_layer_size", hidden_layer_size)
     writer.add_scalar("lstm_controller", lstm_controller)
     writer.add_scalar("seed", seed)
+    writer.add_scalar("batch_size", batch_size)

     model = NTM(vector_length + 1, hidden_layer_size, memory_size, lstm_controller)

@@ -78,15 +79,15 @@ def train(epochs=50_000):
         checkpoint = torch.load(model_path)
         model.load_state_dict(checkpoint)

-    for epoch in range(epochs):
+    for epoch in range(epochs + 1):
         optimizer.zero_grad()
-        input, target = get_training_sequence(sequence_min_length, sequence_max_length, vector_length)
-        state = model.get_initial_state()
+        input, target = get_training_sequence(sequence_min_length, sequence_max_length, vector_length, batch_size)
+        state = model.get_initial_state(batch_size)
         for vector in input:
             _, state = model(vector, state)
         y_out = torch.zeros(target.size())
         for j in range(len(target)):
-            y_out[j], state = model(torch.zeros(1, vector_length + 2), state)
+            y_out[j], state = model(torch.zeros(batch_size, vector_length + 2), state)
         loss = F.binary_cross_entropy(y_out, target)
         loss.backward()
         optimizer.step()

@@ -95,7 +96,7 @@ def train(epochs=50_000):
         y_out_binarized.apply_(lambda x: 0 if x < 0.5 else 1)
         cost = torch.sum(torch.abs(y_out_binarized - target)) / len(target)
         total_cost.append(cost.item())
-        if epoch % feedback_frequence == feedback_frequence - 1:
+        if epoch % feedback_frequence == 0:
             running_loss = sum(total_loss) / len(total_loss)
             running_cost = sum(total_cost) / len(total_cost)
             print(f"Loss at step {epoch}: {running_loss}")
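With the batch dimension built into get_training_sequence, the explicit torch.unsqueeze(target, 1) is gone and both tensors come out in (seq_len, batch, features) layout directly. A quick shape check, assuming get_training_sequence from this file is in scope:

input, target = get_training_sequence(2, 5, vector_length=8, batch_size=2)

assert input.shape[1] == 2 and input.shape[2] == 8 + 2    # bits + delimiter + repeat channel
assert target.shape[1] == 2 and target.shape[2] == 8 + 1  # bits + delimiter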

tests/test_utils.py
Lines changed: 13 additions & 1 deletion

@@ -1,6 +1,6 @@
 import torch

-from ..ntm.utils import circular_convolution
+from ..ntm.utils import circular_convolution, _convolve


 def test_circular_convolution():

@@ -23,3 +23,15 @@ def test_circular_convolution():
     b = torch.tensor([[1, 0, 1, 0, 0]])
     res = torch.tensor([[5, 7, 4, 6, 8]])
     assert torch.equal(circular_convolution(a, b), res)
+
+
+def test_convolve():
+    w = torch.tensor([0, 0, 1, 0, 0])
+    s = torch.tensor([0, 1, 0])
+    res = torch.tensor([0, 0, 1, 0, 0])
+    assert torch.equal(_convolve(w, s), res)
+
+    w = torch.tensor([0, 0, 1.0, 0, 0])
+    s = torch.tensor([0.5, 0, 0.5])
+    res = torch.tensor([0, 0.5, 0, 0.5, 0])
+    assert torch.equal(_convolve(w, s), res)
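These tests rely on a relative import (from ..ntm.utils import ...), so presumably they are collected as part of the package, e.g. with python -m pytest run from the repository root.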
