"""
Contains the AlphazeroParallel class for training a model to play TicTacToe or ConnectFour
with games run parallely to speed up training
"""
# Importing dependencies
import numpy as np
from tqdm import trange
from random import shuffle
import torch
import torch.nn.functional as F

from alphamontecarlo import AlphaNode


# Defining an Alpha MCTS Parallel class
class AlphaMCTSParallel:
    def __init__(self, game, args, model):
        self.game = game
        self.args = args
        self.model = model
"""
The search function should perform the three steps of Alpha Monte Carlo Tree Search:
1. Selection
2. Expansion
3. Backpropagation
This is wrapped with no grad to ensure we don't change gradients accidentally
"""
    @torch.no_grad()
    def search(self, states, spGames):
        policy, _ = self.model(
            torch.tensor(self.game.get_encoded_state(states), device=self.model.device)
        )
        policy = torch.softmax(policy, dim=1).cpu().numpy()
        # Mix Dirichlet noise into every root policy at once; the size argument
        # draws one independent noise vector per state in the batch
        policy = (1 - self.args['dirichlet_epsilon']) * policy + self.args['dirichlet_epsilon'] * \
            np.random.dirichlet([self.args['dirichlet_alpha']] * self.game.action_size, size=policy.shape[0])

        for i, spg in enumerate(spGames):
            spg_policy = policy[i]
            # Mask out invalid moves and renormalize
            valid_moves = self.game.get_valid_moves(states[i])
            spg_policy *= valid_moves
            spg_policy /= np.sum(spg_policy)
            spg.root = AlphaNode(self.game, self.args, states[i], visit_count=1)
            spg.root.expand(spg_policy)
        for search in range(self.args['num_searches']):
            for spg in spGames:
                spg.node = None
                node = spg.root
                # Selection
                while node.is_fully_expanded():
                    node = node.select()
                value, is_terminated = self.game.check_win_and_termination(node.state, node.action_taken)
                value = self.game.get_opponent_value(value)
                # Do backpropagation right away if a terminal node is reached
                if is_terminated:
                    node.backpropagate(value)
                else:
                    spg.node = node

            expandable_spGames = [mappingIdx for mappingIdx in range(len(spGames))
                                  if spGames[mappingIdx].node is not None]
            if len(expandable_spGames) > 0:
                # Batch all expandable leaf states into a single model call
                states = np.stack([spGames[mappingIdx].node.state for mappingIdx in expandable_spGames])
                policy, value = self.model(
                    torch.tensor(self.game.get_encoded_state(states), device=self.model.device)
                )
                policy = torch.softmax(policy, dim=1).cpu().numpy()
                value = value.cpu().numpy()

            for i, mappingIdx in enumerate(expandable_spGames):
                # Expand each expandable game with its slice of the batched output
                spg_policy, spg_value = policy[i], value[i]
                node = spGames[mappingIdx].node
                valid_moves = self.game.get_valid_moves(node.state)
                spg_policy *= valid_moves
                spg_policy /= np.sum(spg_policy)
                # Expansion
                node.expand(spg_policy)
                node.backpropagate(spg_value)
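
# For reference, `search` leans on the following AlphaNode interface imported
# from `alphamontecarlo`. This summary is inferred from the calls above, not a
# definitive spec; see that module for the actual implementation:
#   AlphaNode(game, args, state, visit_count=1)  -- constructor
#   node.is_fully_expanded() -> bool             -- have all children been created?
#   node.select() -> AlphaNode                   -- pick a child (e.g. by UCB score)
#   node.expand(policy)                          -- create children from a policy
#   node.backpropagate(value)                    -- push a value up to the root
# plus the attributes node.state, node.action_taken, node.visit_count,
# and node.children.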


# Class Definition of AlphaZero with parallel self-play
class AlphaZeroParallel:
    """
    The model is a ResNet and the optimizer is Adam
    """
    def __init__(self, model, optimizer, game, args):
        self.model = model
        self.optimizer = optimizer
        self.game = game
        self.args = args
        self.mcts = AlphaMCTSParallel(game, args, model)
"""
Model plays against itself
"""
def selfPlay(self):
return_memory = []
player = 1
spGames = [SPG(self.game) for _ in range(self.args['num_parallel_games'])]
while len(spGames) > 0:
states = np.stack([spg.state for spg in spGames])
neutral_states = self.game.change_perspective(states, player)
self.mcts.search(neutral_states, spGames)
for i in range(len(spGames))[::-1]:
spg = spGames[i]
# Get the probabilities for the different actions
action_probs = np.zeros(self.game.action_size)
for child in spg.root.children:
action_probs[child.action_taken] = child.visit_count
action_probs /= np.sum(action_probs)
spg.memory.append((spg.root.state, action_probs, player))
temperature_action_probs = action_probs ** (1 / self.args['temperature'])
temperature_action_probs /= np.sum(temperature_action_probs)
action = np.random.choice(self.game.action_size, p=temperature_action_probs)
spg.state = self.game.get_next_state(spg.state, action, player)
value, is_terminated = self.game.check_win_and_termination(spg.state, action)
if is_terminated:
for hist_neutral_state, hist_action_probs, hist_player in spg.memory:
hist_outcome = value if hist_player == player else self.game.get_opponent_value(value)
return_memory.append(
(
self.game.get_encoded_state(hist_neutral_state),
hist_action_probs,
hist_outcome
)
)
del(spGames[i])
player = self.game.get_opponent(player)
return return_memory
"""
Training method for the model where batches are shuffled
"""
def train(self, memory):
shuffle(memory)
for batchIdx in range(0, len(memory), self.args['batch_size']):
sample = memory[batchIdx:min(len(memory) - 1, batchIdx + self.args['batch_size'])]
state, policy_targets, value_targets = zip(*sample)
state, policy_targets, value_targets = np.array(state), np.array(policy_targets), \
np.array(value_targets).reshape(-1, 1)
state = torch.tensor(state, dtype=torch.float32, device=self.model.device)
policy_targets = torch.tensor(policy_targets, dtype=torch.float32, device=self.model.device)
value_targets = torch.tensor(value_targets, dtype=torch.float32, device=self.model.device)
output_policy, output_value = self.model(state)
policy_loss = F.cross_entropy(output_policy, policy_targets)
value_loss = F.mse_loss(output_value, value_targets)
loss = policy_loss + value_loss
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
"""
For each iteration create training data for one cycle, train the model and
store the state of the model
"""
def learn(self):
for iteration in range(self.args['num_iterations']):
memory = []
self.model.eval()
for _ in trange(self.args['num_selfPlay_iterations'] // self.args['num_parallel_games']):
memory += self.selfPlay()
self.model.train()
for _ in trange(self.args['num_epochs']):
self.train(memory)
torch.save(self.model.state_dict(), f"models/{self.game}/model_{iteration}.pt")
torch.save(self.optimizer.state_dict(), f"models/{self.game}/optimizer_{iteration}.pt")


# Class Definition for a single Self-Play Game
class SPG:
    def __init__(self, game):
        self.state = game.get_initial_state()
        self.memory = []
        self.root = None
        self.node = None
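

# --- Example usage ---
# A minimal sketch of wiring the classes above together. The `TicTacToe` and
# `ResNet` imports, module names, and constructor signature below are
# assumptions (this file only states that the model is a ResNet and the game is
# TicTacToe or ConnectFour); the `args` keys are the ones read by the code above.
if __name__ == "__main__":
    from tictactoe import TicTacToe  # assumed companion module
    from resnet import ResNet        # assumed companion module

    game = TicTacToe()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Assumed signature; the classes above only require that the model exposes
    # a `device` attribute and maps encoded states to a (policy, value) pair
    model = ResNet(game, num_resblocks=4, num_hidden=64, device=device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

    args = {
        'num_searches': 60,            # MCTS simulations per move
        'dirichlet_epsilon': 0.25,     # weight of the Dirichlet noise at the root
        'dirichlet_alpha': 0.3,        # concentration of the Dirichlet noise
        'temperature': 1.25,           # >1 flattens the visit-count distribution
        'num_parallel_games': 100,     # self-play games run side by side
        'num_selfPlay_iterations': 500,
        'num_epochs': 4,
        'batch_size': 64,
        'num_iterations': 8,
    }

    # Note: learn() saves checkpoints to models/<game>/, which must already exist
    alphazero = AlphaZeroParallel(model, optimizer, game, args)
    alphazero.learn()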