
Commit e88f07e: ceer
1 parent 2749bd4

15 files changed: +2017 additions, 0 deletions

README.md (+58 lines)

## Replay Memory as An Empirical MDP: Combining Conservative Estimation with Experience Replay

![overview](ceer/pic/overview.svg)
### Overview

- PyTorch implementation of Conservative Estimation with Experience Replay ([CEER](https://openreview.net/forum?id=SjzFVSJUt8S)).

- The method is tested on the [Sokoban](https://github.com/mpSchrader/gym-sokoban), [Minigrid](https://github.com/Farama-Foundation/Minigrid), and [MinAtar](https://github.com/kenjyoung/MinAtar) environments.
### Installation

```
pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
pip install -r requirements.txt
```

- Tested with Python 3.7.11 and CUDA 11.4; a quick sanity check follows below.
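To confirm that the CUDA build of PyTorch is active (assuming a CUDA-capable GPU and driver are present), an optional check in a Python shell:

```
import torch
print(torch.__version__, torch.cuda.is_available())  # expect something like "1.11.0+cu113 True"
```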
### Running Experiments

```
python ceer/main.py
```

- Modify `atari_name_list` in `ceer/arguments.py` to select the environment, e.g. `'atari_name_list': ['Sokoban-Push_5x5_1_120']`.

- Other parameters, such as `sample_method_para` (alpha) and `policy_loss_para` (lambda), are also set in `ceer/arguments.py`; a configuration sketch follows this list.
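For example, a run on a MinAtar environment with non-zero alpha and lambda could be configured by editing `para_list_dict` in `ceer/arguments.py` roughly as below; only the relevant keys are shown and the values are illustrative, not the paper's settings:

```
para_list_dict = {
    'atari_name_list': ['MinAtar/Breakout-v0'],  # environment(s) to run
    'sample_method_list': ['kl'],                # 'uniform' falls back to standard DQN replay
    'sample_method_para_list': [0.2],            # alpha
    'policy_loss_para_list': [0.1],              # lambda
    # keep the remaining keys from the shipped arguments.py unchanged
}
```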
### Bibtex

```
@inproceedings{
zhang2023replay,
title={Replay Memory as An Empirical {MDP}: Combining Conservative Estimation with Experience Replay},
author={Hongming Zhang and Chenjun Xiao and Han Wang and Jun Jin and bo xu and Martin M{\"u}ller},
booktitle={The Eleventh International Conference on Learning Representations},
year={2023},
url={https://openreview.net/forum?id=SjzFVSJUt8S}
}
```
### Acknowledgments

- Awesome environments used for testing:

  Sokoban: https://github.com/mpSchrader/gym-sokoban

  Minigrid: https://github.com/Farama-Foundation/Minigrid

  MinAtar: https://github.com/kenjyoung/MinAtar

- Some baselines can be found in the following works:

  TER: https://openreview.net/forum?id=OXRZeMmOI7a

  Dreamerv2: https://github.com/RajGhugare19/dreamerv2

  Tianshou: https://github.com/thu-ml/tianshou

agents.py (+149 lines)

import copy
import time

import utils
from env_wrappers import *
from collections import deque
import numpy as np
import torch
import torch.nn.functional as F
import hashlib
import pickle
from rl_algorithms import TD
from schedules import LinearSchedule
from buffers import BatchBuffer, Graph_buffer


class DQN_Agent():
    def __init__(self, env, net, args_dict):
        self.game_env = env
        self.args_dict = args_dict
        self.action_space = self.game_env.action_space
        self.action_space_set = set(range(self.action_space))
        self.state_space = self.game_env.observation_space
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # self.device = "cpu"
        self.net = net(self.action_space, self.state_space, args_dict['atari_name']).to(self.device)
        self.target_net = net(self.action_space, self.state_space, args_dict['atari_name']).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())

        self.exploration_decay = LinearSchedule(schedule_timesteps=args_dict['decay_step'], final_p=args_dict['exploration_final_eps'], initial_p=args_dict['exploration_initial_eps'])
        self.lr_decay = LinearSchedule(schedule_timesteps=args_dict['final_step'], final_p=0.)
        self.update = TD(self.net, self.target_net, self.lr_decay, self.device, args_dict)

        self.graph_buffer = Graph_buffer(args_dict, action_space=self.action_space)  # CEER
        self.batch_buffer = BatchBuffer(args_dict)  # DQN
        self.current_episode = [[] for _ in range(args_dict['number_env'])]

        self.max_q_mean = 0
        self.all_q_mean = 0
        self.density = 0

    def save_model(self, path):
        torch.save(self.net.state_dict(), path)
        # torch.save(self.net.state_dict(), path, _use_new_zipfile_serialization=False)

    def load_model(self, path):
        self.net.load_state_dict(torch.load(path))

    def act(self, states, rewards, dones, infos, train, current_step):
        # print([(s.dtype, s.shape) for s in states])
        states_tensor = torch.from_numpy(np.array(states)).to(self.device).float()
        # print(states)
        # print(states_tensor.shape)
        with torch.no_grad():
            q_values = self.net(states_tensor)
        q_values = q_values.detach().cpu().numpy()
        # print('q_values :', q_values.shape)
        actions = []
        if train:
            # epsilon-greedy action selection with a linearly decaying epsilon
            epsilon = self.exploration_decay.value(current_step)
            exploration_list = np.random.random(self.args_dict['number_env']) < epsilon
            for i in range(self.args_dict['number_env']):
                # print('number :', i)
                # print(args.number_env, q_values.shape, q_values[i], states_tensor.shape)
                if exploration_list[i]:
                    actions.append(np.random.randint(self.action_space))
                else:
                    actions.append(np.argmax(q_values[i]))

            self.train(states, actions, rewards, dones, infos, current_step)
        else:
            # evaluation: keep a small fixed exploration rate
            exploration_list = np.random.random(self.args_dict['number_env']) < 0.01  # 0.05
            for i in range(self.args_dict['number_env']):
                if exploration_list[i]:
                    actions.append(np.random.randint(self.action_space))
                else:
                    actions.append(np.argmax(q_values[i]))
        # actions = np.argmax(q_values, axis=1)
        # print(q_values)
        return actions

    def train(self, states, actions, rewards, dones, infos, current_step):
        if self.args_dict['sample_method'] != 'uniform':
            # CEER: store transitions in the graph buffer, keyed by an md5 hash of the pickled state
            if rewards is None:
                self.s_t = states
                self.a_t = actions
            else:
                s_t_key_list = []
                for i in range(self.args_dict['number_env']):
                    if dones[i]:
                        s_t_key = hashlib.md5(pickle.dumps(self.s_t[i])).hexdigest() + str(False)
                        s_t1_key = hashlib.md5(pickle.dumps(infos[i]['terminal_state'])).hexdigest() + str(True)
                        self.graph_buffer.add_data(self.s_t[i], self.a_t[i], rewards[i], dones[i],
                                                   infos[i]['terminal_state'], s_t_key, s_t1_key)
                        self.current_episode[i].reverse()
                        self.graph_buffer.update_node(self.args_dict['batch_size'], self.current_episode[i])
                        self.current_episode[i] = []
                    else:
                        s_t_key = hashlib.md5(pickle.dumps(self.s_t[i])).hexdigest() + str(False)
                        s_t1_key = hashlib.md5(pickle.dumps(states[i])).hexdigest() + str(False)
                        self.graph_buffer.add_data(self.s_t[i], self.a_t[i], rewards[i], dones[i], states[i],
                                                   s_t_key, s_t1_key)
                        self.current_episode[i].append(s_t_key)
                    s_t_key_list.append(s_t_key)

                self.s_t = states
                self.a_t = actions
        else:
            # uniform sampling: store transitions in the per-environment DQN replay buffers
            if rewards is None and dones is None:
                for i in range(self.batch_buffer.buffer_num):
                    self.batch_buffer.buffer_list[i].add_data(state_t=states[i], action_t=actions[i])
            else:
                for i in range(self.batch_buffer.buffer_num):
                    self.batch_buffer.buffer_list[i].add_data(
                        state_t=states[i],
                        action_t=actions[i],
                        reward_t=rewards[i],
                        terminal_t=dones[i])

        if current_step % self.args_dict['target_update_interval'] == 0:
            self.target_net.load_state_dict(self.net.state_dict())

        if current_step >= self.args_dict['learning_starts']:
            # print(np.shape(self.batch_buffer.buffer_list))
            if self.args_dict['sample_method'] != 'uniform':
                for _ in range(self.args_dict['batch_num']):
                    s_t, a_t, r_t, t_t, s_t1, target_q_t, updated_t1, \
                        all_target_q_t, not_exist_action_value = self.graph_buffer.sample_batch(self.args_dict['batch_size'])

                    s_t, one_hot_a_t, index, r_t, t_t, s_t1 = self.update.np2torch(
                        self.args_dict['batch_size'], self.action_space, s_t, a_t, r_t, t_t, s_t1)

                    max_q_mean, all_q_mean, density = self.update.learn(self.args_dict['sample_method'],
                        self.graph_buffer, self.args_dict['batch_size'], self.action_space,
                        s_t, one_hot_a_t, r_t, t_t, s_t1, target_q_t, updated_t1,
                        all_target_q_t, not_exist_action_value, self.args_dict['policy_loss_para'])

                    self.max_q_mean = max_q_mean
                    self.all_q_mean = all_q_mean
                    self.density = density
            else:
                for _ in range(self.args_dict['batch_num']):
                    n = int(self.args_dict['batch_size'] / self.args_dict['number_env'])
                    s_t, a_t, r_t, t_t, s_t1 = self.batch_buffer.sample_batch(current_step, n)
                    # print('state:', s_t)
                    s_t, one_hot_a_t, index, r_t, t_t, s_t1 = self.update.np2torch(
                        self.args_dict['batch_size'], self.action_space, s_t, a_t, r_t, t_t, s_t1)
                    self.update.learn(self.args_dict['sample_method'], None, self.args_dict['batch_size'], self.action_space, s_t, one_hot_a_t, r_t, t_t, s_t1)
                    # print('data shape:', s_t.shape, a_t.shape, ret.shape, v.shape, logp.shape, adv.shape)
                    # print('data type:', s.dtype, a.dtype, ret.dtype, v.dtype, logp.dtype, adv.dtype)
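Note that `act` both selects actions and, when `train=True`, pushes the previous transition into the replay structures, so the driver in `ceer/main.py` presumably follows a loop roughly like the sketch below. The `make_envs`/`Net` names and the vectorized reset/step interface are assumptions for illustration, not the repository's actual API:

```
# hypothetical driver loop around DQN_Agent (make_envs, Net and the env API are assumed)
env = make_envs(args_dict)                # vectorized env with args_dict['number_env'] copies
agent = DQN_Agent(env, Net, args_dict)
states = env.reset()
rewards, dones, infos = None, None, None  # no completed transition before the first step
for current_step in range(int(args_dict['final_step'])):
    actions = agent.act(states, rewards, dones, infos, train=True, current_step=current_step)
    states, rewards, dones, infos = env.step(actions)
```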

arguments.py (+45 lines)

# import pynvml
# pynvml.nvmlInit()

# gpu_num = pynvml.nvmlDeviceGetCount()
# if gpu_num:
#     CUDA_VISIBLE_DEVICES = gpu_num - 1  # use the last gpu
#     print('CUDA_VISIBLE_DEVICES:', CUDA_VISIBLE_DEVICES)
CUDA_VISIBLE_DEVICES = 0
number_env = 8

para_list_dict = {
    'atari_name_list': ['Sokoban-Push_5x5_1_120'],
    # 'atari_name_list': ['MinAtar/Asterix-v0', 'MinAtar/Breakout-v0', 'MinAtar/Freeway-v0', 'MinAtar/Seaquest-v0', 'MinAtar/SpaceInvaders-v0',
    #                     'MiniGrid-DoorKey-6x6-v0', 'MiniGrid-Unlock-v0', 'MiniGrid-RedBlueDoors-6x6-v0', 'MiniGrid-SimpleCrossingS9N1-v0',
    #                     'MiniGrid-SimpleCrossingS9N2-v0', 'MiniGrid-LavaCrossingS9N1-v0', 'MiniGrid-LavaCrossingS9N2-v0',
    #                     'Sokoban-Push_5x5_1_120', 'Sokoban-Push_6x6_1_120', 'Sokoban-Push_7x7_1_120', 'Sokoban-Push_6x6_3_120',
    #                     'Sokoban-Push_5x5_2_120', 'Sokoban-Push_6x6_2_120', 'Sokoban-Push_7x7_2_120'],
    'network_type_list': ['large'],         # 'larger', 'large', 'medium', 'small', 'mlp'
    'seed_list': [0],                       # list(range(21))
    'exploration_final_eps_list': [0.01],   # 0.1
    'batch_size_list': [32],                # 32, 64, 128, 256, 512
    'batch_num_list': [2],                  # replay ratio, 0.25, int(number_env*0.25)
    'double_dqn_list': [False],             # True, False
    'update_time_list': [1],                # number of updates for each batch
    'sample_method_list': ['kl'],           # uniform, kl
    'sample_method_para_list': [0.],        # alpha; e.g. [0., 0.2, 0.5, 0.8, 1.]
    'policy_loss_list': [True],             # True, False
    'policy_loss_para_list': [0.],          # lambda; e.g. [0., 0.01, 0.1, 1., 2., 5.]
    'tau_list': [1.],                       # temperature for softmax: 1., 0.1, 0.01
}

final_step = 2e6  # 2e6
learning_rate = 1e-4  # 3e-3, 1e-3, 3e-4, 1e-4, 3e-5, 1e-5
buffer_size = int(final_step / 20)  # int(5e4) # int(1e5) # 1_000_000, 1e6, 1e5, int(1e5/number_env)
learning_starts = final_step * 0.005  # 100, 10000
gamma = 0.99
target_update_interval = 1000
decay_step = final_step / 2  # number of steps over which the exploration rate is annealed (half of training)
exploration_initial_eps = 1.0
max_grad_norm = 10.
test_num = 100

FullyObs_minigrid = True
deterministic = False
fix_difficulty = False
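The `*_list` entries read as a hyperparameter grid, so `ceer/main.py` presumably expands `para_list_dict` into one argument dict per combination. A minimal sketch of that expansion, assuming the `_list` suffix is stripped to match the keys that `agents.py` reads from `args_dict`:

```
# hypothetical grid expansion; the actual logic lives in ceer/main.py
import itertools
from arguments import para_list_dict  # assumes this runs next to ceer/arguments.py

def expand_grid(para_list_dict):
    keys = [k[:-len('_list')] for k in para_list_dict]  # 'atari_name_list' -> 'atari_name'
    for values in itertools.product(*para_list_dict.values()):
        yield dict(zip(keys, values))

for args_dict in expand_grid(para_list_dict):
    print(args_dict['atari_name'], args_dict['sample_method'], args_dict['sample_method_para'])
```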
