@@ -50,39 +50,47 @@ def __init__(self, embedGame, embedAction, global_step, rlConfig, **kwargs):
 
     self.actor = net
 
-  def train(self, state, prev_action, action, prob, advantages, **unused):
+  def train(self, target_log_probs, advantages, **unused):
+    train_log_probs = target_log_probs[:,:-1] # last state has no advantage
+    actor_gain = tf.reduce_mean(tf.mul(train_log_probs, advantages - self.entropy_scale))
+
+    actor_params = self.actor.getVariables()
+
+    def metric(log_p1, log_p2):
+      return tf.reduce_mean(tf.squared_difference(log_p1, log_p2))
+
+    return self.optimizer.optimize(-actor_gain, actor_params, target_log_probs, metric)
+
+  def probs(self, state, prev_action, action, prob, **unused):
     embedded_state = self.embedGame(state)
     embedded_prev_action = self.embedAction(prev_action)
     history = RL.makeHistory(embedded_state, embedded_prev_action, self.rlConfig.memory)
-
-    actor_probs = self.actor(history)
-    log_actor_probs = tf.log(actor_probs)
 
+    actions = self.embedAction(action[:,self.rlConfig.memory:])
+
+    actor_probs = self.actor(history)
+    real_actor_probs = tfl.batch_dot(actions, actor_probs)
+
+    """
     entropy = - tfl.batch_dot(actor_probs, log_actor_probs)
     entropy_avg = tfl.power_mean(self.entropy_power, entropy)
     tf.scalar_summary('entropy_avg', entropy_avg)
     tf.scalar_summary('entropy_min', tf.reduce_min(entropy))
     tf.histogram_summary('entropy', entropy)
-
-    actions = self.embedAction(action[:,self.rlConfig.memory:])
-
-    real_actor_probs = tfl.batch_dot(actions, actor_probs)
-    prob_ratios = prob[:,self.rlConfig.memory:] / real_actor_probs
-    tf.scalar_summary('kl', tf.reduce_mean(tf.log(prob_ratios)))
-
-    real_log_actor_probs = tfl.batch_dot(actions, log_actor_probs)
-    train_log_actor_probs = real_log_actor_probs[:,:-1] # last state has no advantage
-    actor_gain = tf.reduce_mean(tf.mul(train_log_actor_probs, tf.stop_gradient(advantages)))
-    #tf.scalar_summary('actor_gain', actor_gain)
+    """
 
-    actor_loss = -(actor_gain + self.entropy_scale * entropy_avg)
+    tf.scalar_summary('entropy_avg', -tf.reduce_mean(tf.log(prob)))
 
-    actor_params = self.actor.getVariables()
-
-    def metric(p1, p2):
-      return tf.reduce_mean(tfl.kl(p1, p2))
+    behavior_probs = prob[:, self.rlConfig.memory:]
+    ratios = real_actor_probs / behavior_probs
+
+    tf.scalar_summary('kl', -tf.reduce_mean(tf.log(ratios)))
 
-    return self.optimizer.optimize(actor_loss, actor_params, log_actor_probs, metric)
+    return dict(
+      target_probs = real_actor_probs,
+      target_log_probs = tf.log(real_actor_probs),
+      ratios = ratios
+    )
 
   def getPolicy(self, state, **unused):
     return self.actor(state)
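For readers of this commit: the change splits the old `train` op in two. `probs` now builds the policy distribution from the state/action history and returns the probabilities (and log-probabilities) of the actions that were actually taken, plus importance ratios against the behavior policy `prob`; `train` then only scores precomputed `target_log_probs` with the advantages minus the entropy scale. The NumPy sketch below mirrors that arithmetic with made-up shapes (`batch x time`) and a hypothetical `entropy_scale` value; it is an illustration of the math only, not the project's TensorFlow graph code.

```python
import numpy as np

# Illustrative sketch only: shapes, values, and entropy_scale are assumptions.

def actor_gain(target_log_probs, advantages, entropy_scale=1e-3):
  """Mirrors the new `train`: mean of log pi(a|s) * (advantage - entropy_scale).

  target_log_probs: [batch, time] log-probs of the taken actions under the
  current policy; the last step is dropped because it has no advantage.
  advantages: [batch, time-1] advantage estimates, assumed to come from a
  critic computed elsewhere.
  """
  train_log_probs = target_log_probs[:, :-1]  # last state has no advantage
  return np.mean(train_log_probs * (advantages - entropy_scale))

def diagnostics(target_probs, behavior_probs):
  """Mirrors the summaries in the new `probs`: importance ratios and KL."""
  ratios = target_probs / behavior_probs        # pi(a|s) / mu(a|s)
  kl = -np.mean(np.log(ratios))                 # estimates KL(mu || pi)
  entropy = -np.mean(np.log(behavior_probs))    # behavior-entropy estimate
  return ratios, kl, entropy

# Toy usage with random action probabilities.
rng = np.random.default_rng(0)
target_probs = rng.uniform(0.05, 1.0, size=(8, 11))
behavior_probs = rng.uniform(0.05, 1.0, size=(8, 11))
advantages = rng.normal(size=(8, 10))

gain = actor_gain(np.log(target_probs), advantages)
ratios, kl, entropy = diagnostics(target_probs, behavior_probs)
```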
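One design detail worth flagging: the `metric` handed to `self.optimizer.optimize` changes from a KL divergence over the policy's full action distributions to a mean squared difference of the taken actions' log-probabilities, and the optimizer is now given `target_log_probs` instead of `log_actor_probs`. Assuming the optimizer uses this metric to bound how far an update moves its third argument (a trust-region-style check, which this diff alone does not confirm), the two styles of metric compare as in the generic sketch below; this is not a reimplementation of `tfl.kl`, and `eps` and the shapes are assumptions.

```python
import numpy as np

def kl_metric(p1, p2, eps=1e-8):
  """Old-style metric: mean KL(p1 || p2) between per-step action distributions.

  p1, p2: [batch, time, num_actions] probability distributions.
  """
  return np.mean(np.sum(p1 * (np.log(p1 + eps) - np.log(p2 + eps)), axis=-1))

def squared_log_metric(log_p1, log_p2):
  """New-style metric: mean squared difference of taken-action log-probs.

  log_p1, log_p2: [batch, time] log-probabilities of the sampled actions.
  """
  return np.mean(np.square(log_p1 - log_p2))
```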