# Gathering all the parameters (that we can modify to explore)
class Params():
    """Container for the hyperparameters and run settings of the experiment.

    Every setting is stored as a plain instance attribute, so it can be
    overridden either through the keyword arguments below or by assigning to
    the attribute after construction. ``Params()`` with no arguments yields
    the default configuration.

    Args:
        lr: learning rate handed to the optimizer.
        gamma: presumably the reward discount factor - confirm against the
            training code.
        tau: presumably the advantage-estimation smoothing coefficient -
            confirm against the training code.
        seed: seed given to ``torch.manual_seed``.
        num_processes: number of training processes to launch.
        num_steps: presumably the number of environment steps per update -
            confirm against the training code.
        max_episode_length: presumably the cap on steps in one episode -
            confirm against the training code.
        env_name: identifier of the environment to create.
    """

    def __init__(self, lr=0.0001, gamma=0.99, tau=1., seed=1,
                 num_processes=16, num_steps=20, max_episode_length=10000,
                 env_name='Breakout-v0'):
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.seed = seed
        self.num_processes = num_processes
        self.num_steps = num_steps
        self.max_episode_length = max_episode_length
        self.env_name = env_name
# Main run
# Limit OpenMP to a single thread; this must be set before any worker starts.
os.environ['OMP_NUM_THREADS'] = '1'

# All tunable settings live on one Params instance.
params = Params()

# Seed PyTorch's random number generator (not essential).
torch.manual_seed(params.seed)

# Build the environment named in the settings.
env = create_atari_env(params.env_name)

# The model that every agent process reads from and updates. It is built from
# the first dimension of the observation shape and the action space.
shared_model = ActorCritic(
    env.observation_space.shape[0],
    env.action_space,
)
# Presumably moves the parameters into shared memory so that other processes
# can access them - confirm that ActorCritic is an nn.Module.
shared_model.share_memory()

# The optimizer acts on the shared model, so its state is shared as well.
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
optimizer.share_memory()

# Launch the 'test' process first. It receives params.num_processes as its
# first argument, followed by the settings and the shared model; the optimizer
# is not passed to it. NOTE(review): 'mp' is presumably torch.multiprocessing,
# whose Process runs the target in a separate process (not a thread).
p = mp.Process(
    target=test,
    args=(params.num_processes, params, shared_model),
)
p.start()

# Keep track of every started process, beginning with the test process.
processes = [p]
forrankinrange(0,params.num_processes):# making a loop to run all the other processes that will be trained by updating the shared model
forpinprocesses:# creating a pointer that will allow to kill all the threads when at least one of the threads, or main.py will be killed, allowing to stop the program safely
# Recurrent layer, output heads and weight initialization. NOTE(review): the enclosing method header and the earlier layer definitions are not visible in this excerpt.
self.lstm=nn.LSTMCell(32*3*3,256)# LSTM cell taking a flattened 32*3*3 = 288-feature input and producing hidden and cell states of size 256
num_outputs=action_space.n# getting the number of possible actions from the action space
self.critic_linear=nn.Linear(256,1)# full connection of the critic: maps the 256 features to a single value - output = V(S)
self.actor_linear=nn.Linear(256,num_outputs)# full connection of the actor: maps the 256 features to one output per action (Q(S,A) in the author's notation)
self.apply(weights_init)# initializing the weights by applying weights_init to the model (presumably to every sub-module - confirm that self is an nn.Module)
self.actor_linear.weight.data=normalized_columns_initializer(self.actor_linear.weight.data,0.01)# re-initializing the actor weights with a scale of 0.01 (presumably the target standard deviation - confirm against normalized_columns_initializer)
self.actor_linear.bias.data.fill_(0)# initializing the actor bias with zeros
self.critic_linear.weight.data=normalized_columns_initializer(self.critic_linear.weight.data,1.0)# re-initializing the critic weights with a scale of 1.0 (presumably the target standard deviation - confirm against normalized_columns_initializer)
self.critic_linear.bias.data.fill_(0)# initializing the critic bias with zeros
self.lstm.bias_ih.data.fill_(0)# initializing the lstm input-to-hidden bias with zeros
self.lstm.bias_hh.data.fill_(0)# initializing the lstm hidden-to-hidden bias with zeros
self.train()# setting the module in "train" mode (this matters for layers such as dropout and batchnorm, if the model has any)
defforward(self,inputs):
inputs,(hx,cx)=inputs# getting separately the input images to the tuple (hidden states, cell states)
x=F.elu(self.conv1(inputs))# forward propagating the signal from the input images to the 1st convolutional layer
x=F.elu(self.conv2(x))# forward propagating the signal from the 1st convolutional layer to the 2nd convolutional layer
x=F.elu(self.conv3(x))# forward propagating the signal from the 2nd convolutional layer to the 3rd convolutional layer
x=F.elu(self.conv4(x))# forward propagating the signal from the 3rd convolutional layer to the 4th convolutional layer
x=x.view(-1,32*3*3)# flattening the last convolutional layer into this 1D vector x
hx,cx=self.lstm(x,(hx,cx))# the LSTM takes as input x and the old hidden & cell states and ouputs the new hidden & cell states
x=hx# getting the useful output, which are the hidden states (principle of the LSTM)