# Improvement of the Gym environment with universe
import cv2
import gym
import numpy as np
from gym.spaces.box import Box
from gym import wrappers
# Taken from https://github.com/openai/universe-starter-agent
def create_atari_env(env_id, video=False):
env = gym.make(env_id)
if video:
env = wrappers.Monitor(env, 'test', force=True)
env = MyAtariRescale42x42(env)
env = MyNormalizedEnv(env)
return env
def _process_frame42(frame):
frame = frame[34:34 + 160, :160]
# Resize by half, then down to 42x42 (essentially mipmapping). If
# we resize directly we lose pixels that, when mapped to 42x42,
# aren't close enough to the pixel boundary.
frame = cv2.resize(frame, (80, 80))
frame = cv2.resize(frame, (42, 42))
frame = frame.mean(2)
frame = frame.astype(np.float32)
frame *= (1.0 / 255.0)
#frame = np.reshape(frame, [1, 42, 42])
return frame
class MyAtariRescale42x42(gym.ObservationWrapper):
def __init__(self, env=None):
super(MyAtariRescale42x42, self).__init__(env)
self.observation_space = Box(0.0, 1.0, [1, 42, 42])
def _observation(self, observation):
return _process_frame42(observation)
class MyNormalizedEnv(gym.ObservationWrapper):
def __init__(self, env=None):
super(MyNormalizedEnv, self).__init__(env)
self.state_mean = 0
self.state_std = 0
self.alpha = 0.9999
self.num_steps = 0
def _observation(self, observation):
self.num_steps += 1
self.state_mean = self.state_mean * self.alpha + \
observation.mean() * (1 - self.alpha)
self.state_std = self.state_std * self.alpha + \
observation.std() * (1 - self.alpha)
unbiased_mean = self.state_mean / (1 - pow(self.alpha, self.num_steps))
unbiased_std = self.state_std / (1 - pow(self.alpha, self.num_steps))
ret = (observation - unbiased_mean) / (unbiased_std + 1e-8)
return np.expand_dims(ret, axis=0)
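# Illustrative sketch (not part of the original files): a quick sanity check of the wrappers
# above, assuming the old Gym API in which reset() returns only the observation. The processed
# observation should be a 1x42x42 float array, roughly zero-mean and unit-variance.
if __name__ == '__main__':
    env = create_atari_env('Breakout-v0')
    obs = env.reset()
    print(obs.shape)              # expected: (1, 42, 42)
    print(obs.mean(), obs.std())  # expected: close to 0 and 1 on the first frame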
# Main code
from __future__ import print_function
import os
import torch
import torch.multiprocessing as mp
from envs import create_atari_env
from model import ActorCritic
from train import train
from test import test
import my_optim
# Gathering all the parameters (that we can modify to explore)
class Params():
def __init__(self):
self.lr = 0.0001
self.gamma = 0.99
self.tau = 1.
self.seed = 1
self.num_processes = 16
self.num_steps = 20
self.max_episode_length = 10000
self.env_name = 'Breakout-v0'
# Main run
os.environ['OMP_NUM_THREADS'] = '1'
params = Params()
torch.manual_seed(params.seed)
env = create_atari_env(params.env_name)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
optimizer.share_memory()
processes = []
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))
p.start()
processes.append(p)
for rank in range(0, params.num_processes):
p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
p.start()
processes.append(p)
for p in processes:
p.join()
# AI for Breakout
# Importing the libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# Initializing and setting the variance of a tensor of weights
def normalized_columns_initializer(weights, std=1.0):
out = torch.randn(weights.size())
out *= std / torch.sqrt(out.pow(2).sum(1, True))
return out
# Initializing the weights of the neural network in an optimal way for learning
def weights_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
weight_shape = list(m.weight.data.size())
fan_in = np.prod(weight_shape[1:4])
fan_out = np.prod(weight_shape[2:4]) * weight_shape[0]
w_bound = np.sqrt(6. / (fan_in + fan_out))
m.weight.data.uniform_(-w_bound, w_bound)
m.bias.data.fill_(0)
elif classname.find('Linear') != -1:
weight_shape = list(m.weight.data.size())
fan_in = weight_shape[1]
fan_out = weight_shape[0]
w_bound = np.sqrt(6. / (fan_in + fan_out))
m.weight.data.uniform_(-w_bound, w_bound)
m.bias.data.fill_(0)
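# Illustrative sketch (not part of the original files): checking the two initializers above on
# tiny layers. normalized_columns_initializer rescales every output unit's weight vector to an
# L2 norm of std, and weights_init draws conv/linear weights uniformly in [-w_bound, w_bound]
# with w_bound = sqrt(6 / (fan_in + fan_out)) (a Xavier/Glorot-style bound).
if __name__ == '__main__':
    lin = nn.Linear(256, 4)
    lin.weight.data = normalized_columns_initializer(lin.weight.data, std=0.01)
    print(lin.weight.data.pow(2).sum(1).sqrt())  # expected: four values equal to 0.01
    conv = nn.Conv2d(1, 32, 3)
    weights_init(conv)
    # for this conv: fan_in = 1*3*3 = 9, fan_out = 3*3*32 = 288, so w_bound = sqrt(6/297) ~ 0.14
    print(conv.weight.data.abs().max())          # expected: no larger than ~0.14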
# Making the A3C brain
class ActorCritic(torch.nn.Module):
def __init__(self, num_inputs, action_space):
super(ActorCritic, self).__init__()
self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1)
self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
self.lstm = nn.LSTMCell(32 * 3 * 3, 256)
num_outputs = action_space.n
self.critic_linear = nn.Linear(256, 1)
self.actor_linear = nn.Linear(256, num_outputs)
self.apply(weights_init)
self.actor_linear.weight.data = normalized_columns_initializer(self.actor_linear.weight.data, 0.01)
self.actor_linear.bias.data.fill_(0)
self.critic_linear.weight.data = normalized_columns_initializer(self.critic_linear.weight.data, 1.0)
self.critic_linear.bias.data.fill_(0)
self.lstm.bias_ih.data.fill_(0)
self.lstm.bias_hh.data.fill_(0)
self.train()
def forward(self, inputs):
inputs, (hx, cx) = inputs
x = F.elu(self.conv1(inputs))
x = F.elu(self.conv2(x))
x = F.elu(self.conv3(x))
x = F.elu(self.conv4(x))
x = x.view(-1, 32 * 3 * 3)
hx, cx = self.lstm(x, (hx, cx))
x = hx
return self.critic_linear(x), self.actor_linear(x), (hx, cx)
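# Illustrative sketch (not part of the original files): tracing shapes through the network for
# one 1x42x42 observation. With kernel 3, stride 2 and padding 1, each convolution roughly
# halves the spatial size: 42 -> 21 -> 11 -> 6 -> 3, which is why the flattened vector and the
# LSTM input size are both 32 * 3 * 3 = 288. FakeActionSpace is a stand-in, since ActorCritic
# only reads action_space.n.
if __name__ == '__main__':
    from torch.autograd import Variable

    class FakeActionSpace(object):
        n = 4  # hypothetical number of actions

    model = ActorCritic(num_inputs=1, action_space=FakeActionSpace())
    x = Variable(torch.randn(1, 1, 42, 42))
    hx = Variable(torch.zeros(1, 256))
    cx = Variable(torch.zeros(1, 256))
    value, action_scores, (hx, cx) = model((x, (hx, cx)))
    print(value.size(), action_scores.size())  # expected: (1, 1) and (1, 4)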
# Optimizer
import math
import torch
import torch.optim as optim
class SharedAdam(optim.Adam):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
state['step'] = torch.zeros(1)
state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()
def share_memory(self):
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
state['step'].share_memory_()
state['exp_avg'].share_memory_()
state['exp_avg_sq'].share_memory_()
def step(self):
loss = None
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
state = self.state[p]
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
if group['weight_decay'] != 0:
grad = grad.add(group['weight_decay'], p.data)
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
denom = exp_avg_sq.sqrt().add_(group['eps'])
bias_correction1 = 1 - beta1 ** state['step'][0]
bias_correction2 = 1 - beta2 ** state['step'][0]
step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
p.data.addcdiv_(-step_size, exp_avg, denom)
return loss
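# Illustrative sketch (not part of the original files): SharedAdam keeps its step counter and
# moment estimates in tensors rather than Python numbers, so that share_memory() can place them
# in shared memory and every training process updates the same optimizer state. A minimal
# single-process usage example on a toy layer:
if __name__ == '__main__':
    import torch.nn as nn
    from torch.autograd import Variable
    net = nn.Linear(3, 1)
    opt = SharedAdam(net.parameters(), lr=0.01)
    opt.share_memory()                     # step counter and moments now live in shared memory
    loss = net(Variable(torch.ones(1, 3))).pow(2).sum()
    loss.backward()
    opt.step()                             # one bias-corrected Adam update
    print(opt.state[net.weight]['step'])   # expected: a tensor containing 1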
# Test Agent
import torch
import torch.nn.functional as F
from envs import create_atari_env
from model import ActorCritic
from torch.autograd import Variable
import time
from collections import deque
def test(rank, params, shared_model):
torch.manual_seed(params.seed + rank)
env = create_atari_env(params.env_name, video=True)
env.seed(params.seed + rank)
model = ActorCritic(env.observation_space.shape[0], env.action_space)
model.eval()
state = env.reset()
state = torch.from_numpy(state)
reward_sum = 0
done = True
start_time = time.time()
actions = deque(maxlen=100)
episode_length = 0
while True:
episode_length += 1
if done:
model.load_state_dict(shared_model.state_dict())
cx = Variable(torch.zeros(1, 256), volatile=True)
hx = Variable(torch.zeros(1, 256), volatile=True)
else:
cx = Variable(cx.data, volatile=True)
hx = Variable(hx.data, volatile=True)
value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
prob = F.softmax(action_value)
action = prob.max(1)[1].data.numpy()
state, reward, done, _ = env.step(action[0])
reward_sum += reward
if done:
print("Time {}, episode reward {}, episode length {}".format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length))
reward_sum = 0
episode_length = 0
actions.clear()
state = env.reset()
time.sleep(60)
state = torch.from_numpy(state)
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000000.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 0}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000001.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 1}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000008.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 8}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000027.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 27}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000064.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 64}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000125.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 125}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000216.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 216}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000343.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 343}
\ No newline at end of file
# Training the AI
import torch
import torch.nn.functional as F
from envs import create_atari_env
from model import ActorCritic
from torch.autograd import Variable
def ensure_shared_grads(model, shared_model):
for param, shared_param in zip(model.parameters(), shared_model.parameters()):
if shared_param.grad is not None:
return
shared_param._grad = param.grad
def train(rank, params, shared_model, optimizer):
torch.manual_seed(params.seed + rank)
env = create_atari_env(params.env_name)
env.seed(params.seed + rank)
model = ActorCritic(env.observation_space.shape[0], env.action_space)
state = env.reset()
state = torch.from_numpy(state)
done = True
episode_length = 0
while True:
episode_length += 1
model.load_state_dict(shared_model.state_dict())
if done:
cx = Variable(torch.zeros(1, 256))
hx = Variable(torch.zeros(1, 256))
else:
cx = Variable(cx.data)
hx = Variable(hx.data)
values = []
log_probs = []
rewards = []
entropies = []
for step in range(params.num_steps):
value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
prob = F.softmax(action_values)
log_prob = F.log_softmax(action_values)
entropy = -(log_prob * prob).sum(1)
entropies.append(entropy)
action = prob.multinomial().data
log_prob = log_prob.gather(1, Variable(action))
values.append(value)
log_probs.append(log_prob)
state, reward, done, _ = env.step(action.numpy())
done = (done or episode_length >= params.max_episode_length)
reward = max(min(reward, 1), -1)
if done:
episode_length = 0
state = env.reset()
state = torch.from_numpy(state)
rewards.append(reward)
if done:
break
R = torch.zeros(1, 1)
if not done:
value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
R = value.data
values.append(Variable(R))
policy_loss = 0
value_loss = 0
R = Variable(R)
gae = torch.zeros(1, 1)
for i in reversed(range(len(rewards))):
R = params.gamma * R + rewards[i]
advantage = R - values[i]
value_loss = value_loss + 0.5 * advantage.pow(2)
TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
gae = gae * params.gamma * params.tau + TD
policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]
optimizer.zero_grad()
(policy_loss + 0.5 * value_loss).backward()
torch.nn.utils.clip_grad_norm(model.parameters(), 40)
ensure_shared_grads(model, shared_model)
optimizer.step()
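# Illustrative sketch (not part of the original files): the backward loop above computes n-step
# returns R and Generalized Advantage Estimation (GAE) terms. The same recursion on plain Python
# floats, with hypothetical rewards and values, gamma = 0.99 and tau = 1 (note that with tau = 1
# the GAE term coincides with the n-step advantage R - V(s)):
if __name__ == '__main__':
    gamma, tau = 0.99, 1.0
    rewards = [0.0, 0.0, 1.0]         # made-up clipped rewards for a 3-step rollout
    values = [0.5, 0.6, 0.7, 0.8]     # made-up V(s) for the 3 steps plus the bootstrap value
    R, gae = values[-1], 0.0          # bootstrapping from the last value, as when done is False
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                           # n-step return
        advantage = R - values[i]                            # drives the value loss
        TD = rewards[i] + gamma * values[i + 1] - values[i]  # one-step temporal-difference error
        gae = gae * gamma * tau + TD                         # drives the policy loss
        print(i, round(R, 4), round(advantage, 4), round(gae, 4))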
# Improvement of the Gym environment with universe
import cv2
import gym
import numpy as np
from gym.spaces.box import Box
from gym import wrappers
# Taken from https://github.com/openai/universe-starter-agent
def create_atari_env(env_id, video=False):
env = gym.make(env_id)
if video:
env = wrappers.Monitor(env, 'test', force=True)
env = MyAtariRescale42x42(env)
env = MyNormalizedEnv(env)
return env
def _process_frame42(frame):
frame = frame[34:34 + 160, :160]
# Resize by half, then down to 42x42 (essentially mipmapping). If
# we resize directly we lose pixels that, when mapped to 42x42,
# aren't close enough to the pixel boundary.
frame = cv2.resize(frame, (80, 80))
frame = cv2.resize(frame, (42, 42))
frame = frame.mean(2)
frame = frame.astype(np.float32)
frame *= (1.0 / 255.0)
#frame = np.reshape(frame, [1, 42, 42])
return frame
class MyAtariRescale42x42(gym.ObservationWrapper):
def __init__(self, env=None):
super(MyAtariRescale42x42, self).__init__(env)
self.observation_space = Box(0.0, 1.0, [1, 42, 42])
def _observation(self, observation):
return _process_frame42(observation)
class MyNormalizedEnv(gym.ObservationWrapper):
def __init__(self, env=None):
super(MyNormalizedEnv, self).__init__(env)
self.state_mean = 0
self.state_std = 0
self.alpha = 0.9999
self.num_steps = 0
def _observation(self, observation):
self.num_steps += 1
self.state_mean = self.state_mean * self.alpha + \
observation.mean() * (1 - self.alpha)
self.state_std = self.state_std * self.alpha + \
observation.std() * (1 - self.alpha)
unbiased_mean = self.state_mean / (1 - pow(self.alpha, self.num_steps))
unbiased_std = self.state_std / (1 - pow(self.alpha, self.num_steps))
ret = (observation - unbiased_mean) / (unbiased_std + 1e-8)
return np.expand_dims(ret, axis=0)
# Main code
from __future__ import print_function
import os
import torch
import torch.multiprocessing as mp
from envs import create_atari_env
from model import ActorCritic
from train import train
from test import test
import my_optim
# Gathering all the parameters (that we can modify to explore)
class Params():
def __init__(self):
self.lr = 0.0001
self.gamma = 0.99
self.tau = 1.
self.seed = 1
self.num_processes = 16
self.num_steps = 20
self.max_episode_length = 10000
self.env_name = 'Breakout-v0'
# Main run
os.environ['OMP_NUM_THREADS'] = '1' # limiting each process to one OpenMP thread, to avoid oversubscribing the CPU
params = Params() # creating the params object from the Params class, that sets all the model parameters
torch.manual_seed(params.seed) # setting the seed (not essential)
env = create_atari_env(params.env_name) # creating an optimized (cropped, rescaled and normalized) environment thanks to the wrappers in envs.py
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space) # shared_model is the model shared by the different agents (separate processes, possibly running on different cores)
shared_model.share_memory() # storing the model in shared memory, so that all the processes can access it even when they run on different cores
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr) # the optimizer is also shared because it acts on the shared model
optimizer.share_memory() # likewise, storing the optimizer state in shared memory so that all the agents can use it to optimize the shared model
processes = [] # initializing the processes with an empty list
p = mp.Process(target=test, args=(params.num_processes, params, shared_model)) # creating the 'test' process, passing the arguments 'args' to the 'test' target function - the test process doesn't update the shared model, it only evaluates it - torch.multiprocessing.Process runs a function in a separate process
p.start() # starting the created process p
processes.append(p) # adding the created process p to the list of processes
for rank in range(0, params.num_processes): # launching the training processes, each of which updates the shared model
p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
p.start()
processes.append(p)
for p in processes: # joining each process, so that the main script waits for all the agents to finish and the program can stop cleanly
p.join()
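# Illustrative sketch (not part of the original files): the shared-memory pattern used above,
# reduced to its minimal form - one model whose parameters live in shared memory, updated
# concurrently by several worker processes (toy_worker, toy_net and the toy loss are hypothetical).
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

def toy_worker(shared_net):
    local_optimizer = optim.SGD(shared_net.parameters(), lr=0.01)  # each worker has its own optimizer object
    for _ in range(10):  # a few gradient steps taken directly on the shared parameters
        loss = shared_net(Variable(torch.ones(1, 3))).pow(2).sum()
        local_optimizer.zero_grad()
        loss.backward()
        local_optimizer.step()

if __name__ == '__main__':
    toy_net = nn.Linear(3, 1)
    toy_net.share_memory()  # the parameters become visible to every child process
    workers = [mp.Process(target=toy_worker, args=(toy_net,)) for _ in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()  # the parent waits for every worker, just as main.py does above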
# AI for Breakout
# Importing the libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# Initializing and setting the variance of a tensor of weights
def normalized_columns_initializer(weights, std=1.0):
out = torch.randn(weights.size())
out *= std / torch.sqrt(out.pow(2).sum(1, True)) # thanks to this initialization, each output unit's weight vector has an L2 norm equal to std
return out
# Initializing the weights of the neural network in an optimal way for learning
def weights_init(m):
classname = m.__class__.__name__ # python trick that will look for the type of connection in the object "m" (convolution or full connection)
if classname.find('Conv') != -1: # if the connection is a convolution
weight_shape = list(m.weight.data.size()) # list containing the shape of the weights in the object "m"
fan_in = np.prod(weight_shape[1:4]) # dim1 * dim2 * dim3
fan_out = np.prod(weight_shape[2:4]) * weight_shape[0] # dim0 * dim2 * dim3
w_bound = np.sqrt(6. / (fan_in + fan_out)) # weight bound
m.weight.data.uniform_(-w_bound, w_bound) # generating some random weights of order inversely proportional to the size of the tensor of weights
m.bias.data.fill_(0) # initializing all the bias with zeros
elif classname.find('Linear') != -1: # if the connection is a full connection
weight_shape = list(m.weight.data.size()) # list containing the shape of the weights in the object "m"
fan_in = weight_shape[1] # dim1
fan_out = weight_shape[0] # dim0
w_bound = np.sqrt(6. / (fan_in + fan_out)) # weight bound
m.weight.data.uniform_(-w_bound, w_bound) # generating some random weights of order inversely proportional to the size of the tensor of weights
m.bias.data.fill_(0) # initializing all the bias with zeros
# Making the A3C brain
class ActorCritic(torch.nn.Module):
def __init__(self, num_inputs, action_space):
super(ActorCritic, self).__init__()
self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1) # first convolution
self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1) # second convolution
self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1) # third convolution
self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1) # fourth convolution
self.lstm = nn.LSTMCell(32 * 3 * 3, 256) # making an LSTM (Long Short Term Memory) cell to learn the temporal properties of the input - it outputs an encoded vector of size 256 that summarizes the recent history of the game
num_outputs = action_space.n # getting the number of possible actions
self.critic_linear = nn.Linear(256, 1) # full connection of the critic: output = V(S)
self.actor_linear = nn.Linear(256, num_outputs) # full connection of the actor: outputs one score per action, from which the policy is obtained by a softmax
self.apply(weights_init) # initializing the weights of the model with random weights
self.actor_linear.weight.data = normalized_columns_initializer(self.actor_linear.weight.data, 0.01) # setting the standard deviation of the actor tensor of weights to 0.01
self.actor_linear.bias.data.fill_(0) # initializing the actor bias with zeros
self.critic_linear.weight.data = normalized_columns_initializer(self.critic_linear.weight.data, 1.0) # setting the standard deviation of the critic tensor of weights to 1.0
self.critic_linear.bias.data.fill_(0) # initializing the critic bias with zeros
self.lstm.bias_ih.data.fill_(0) # initializing the lstm bias with zeros
self.lstm.bias_hh.data.fill_(0) # initializing the lstm bias with zeros
self.train() # setting the module in "train" mode (this would activate layers such as dropout or batch normalization, if the model had any)
def forward(self, inputs):
inputs, (hx, cx) = inputs # separating the input images from the tuple (hidden states, cell states)
x = F.elu(self.conv1(inputs)) # forward propagating the signal from the input images to the 1st convolutional layer
x = F.elu(self.conv2(x)) # forward propagating the signal from the 1st convolutional layer to the 2nd convolutional layer
x = F.elu(self.conv3(x)) # forward propagating the signal from the 2nd convolutional layer to the 3rd convolutional layer
x = F.elu(self.conv4(x)) # forward propagating the signal from the 3rd convolutional layer to the 4th convolutional layer
x = x.view(-1, 32 * 3 * 3) # flattening the last convolutional layer into this 1D vector x
hx, cx = self.lstm(x, (hx, cx)) # the LSTM cell takes as input x and the old hidden & cell states and outputs the new hidden & cell states
x = hx # getting the useful output, which are the hidden states (principle of the LSTM)
return self.critic_linear(x), self.actor_linear(x), (hx, cx) # returning the output of the critic (V(S)), the action scores of the actor, and the new hidden & cell states ((hx, cx))
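# Illustrative sketch (not part of the original files): how the actor output above is turned into
# an action elsewhere in this project - train.py samples from the softmax distribution
# (exploration), while test.py takes the argmax (pure exploitation). The scores below are made up.
if __name__ == '__main__':
    from torch.autograd import Variable
    action_scores = Variable(torch.Tensor([[1.0, 2.0, 0.5, 0.1]]))  # hypothetical actor output for 4 actions
    prob = F.softmax(action_scores)
    sampled_action = prob.multinomial().data  # what train.py does (stochastic)
    greedy_action = prob.max(1)[1].data       # what test.py does (deterministic, here action 1)
    print(prob.data, sampled_action, greedy_action)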
# Optimizer
import math
import torch
import torch.optim as optim
# Implementing the Adam optimizer with shared states
class SharedAdam(optim.Adam): # object that inherits from optim.Adam
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) # inheriting from the tools of optim.Adam
for group in self.param_groups: # self.param_groups contains all the attributes of the optimizer, including the parameters to optimize (the weights of the network) contained in self.param_groups['params']
for p in group['params']: # for each tensor p of weights to optimize
state = self.state[p] # at the beginning, self.state is an empty dictionary so state = {} and self.state = {p:{}} = {p: state}
state['step'] = torch.zeros(1) # counting the steps: state = {'step' : tensor([0])}
state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() # the update of the adam optimizer is based on an exponential moving average of the gradient (moment 1)
state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() # the update of the adam optimizer is also based on an exponential moving average of the square of the gradient (moment 2)
# Sharing the memory
def share_memory(self):
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
state['step'].share_memory_() # moving the tensor into shared memory, so that every process sees and updates the same storage
state['exp_avg'].share_memory_() # same for the first moment estimate
state['exp_avg_sq'].share_memory_() # same for the second moment estimate
# Performing a single optimization step of the Adam algorithm (see algorithm 1 in https://arxiv.org/pdf/1412.6980.pdf)
def step(self):
loss = None
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
state = self.state[p]
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
if group['weight_decay'] != 0:
grad = grad.add(group['weight_decay'], p.data)
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
denom = exp_avg_sq.sqrt().add_(group['eps'])
bias_correction1 = 1 - beta1 ** state['step'][0]
bias_correction2 = 1 - beta2 ** state['step'][0]
step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
p.data.addcdiv_(-step_size, exp_avg, denom)
return loss
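# Illustrative sketch (not part of the original files): a consequence of the bias corrections
# above is that the very first Adam update has a magnitude close to lr, whatever the gradient
# scale, since exp_avg / (sqrt(exp_avg_sq) + eps) is approximately sign(grad) at step 1.
if __name__ == '__main__':
    from torch.autograd import Variable
    w = Variable(torch.Tensor([5.0]), requires_grad=True)  # a single toy parameter
    opt = SharedAdam([w], lr=0.1)
    (w * 3.0).sum().backward()   # arbitrary loss whose gradient is 3.0
    opt.step()
    print(w.data)                # expected: close to 5.0 - 0.1 = 4.9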
# Test Agent
import torch
import torch.nn.functional as F
from envs import create_atari_env
from model import ActorCritic
from torch.autograd import Variable
import time
from collections import deque
# Making the test agent (won't update the model but will just use the shared model to explore)
def test(rank, params, shared_model):
torch.manual_seed(params.seed + rank) # shifting the seed with rank, so that the test agent is desynchronized from the training agents
env = create_atari_env(params.env_name, video=True) # running an environment with a video
env.seed(params.seed + rank) # aligning the seed of the environment with the seed of the agent
model = ActorCritic(env.observation_space.shape[0], env.action_space) # creating one model
model.eval() # putting the model in "eval" mode because it won't be trained
state = env.reset() # getting the input images as numpy arrays
state = torch.from_numpy(state) # converting them into torch tensors
reward_sum = 0 # initializing the sum of rewards to 0
done = True # initializing done to True
start_time = time.time() # getting the starting time to measure the computation time
actions = deque(maxlen=100) # cf https://pymotw.com/2/collections/deque.html
episode_length = 0 # initializing the episode length to 0
while True: # repeat
episode_length += 1 # incrementing the episode length by one
if done: # synchronizing with the shared model (same as train.py)
model.load_state_dict(shared_model.state_dict())
cx = Variable(torch.zeros(1, 256), volatile=True)
hx = Variable(torch.zeros(1, 256), volatile=True)
else:
cx = Variable(cx.data, volatile=True)
hx = Variable(hx.data, volatile=True)
value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
prob = F.softmax(action_value)
action = prob.max(1)[1].data.numpy() # the test agent does not explore, it directly plays the best action
state, reward, done, _ = env.step(action[0]) # done = done or episode_length >= params.max_episode_length
reward_sum += reward
if done: # printing the results at the end of each part
print("Time {}, episode reward {}, episode length {}".format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length))
reward_sum = 0 # reinitializing the sum of rewards
episode_length = 0 # reinitializing the episode length
actions.clear() # reinitializing the actions
state = env.reset() # reinitializing the environment
time.sleep(60) # pausing for one minute, letting the training agents practice before the next test episode
state = torch.from_numpy(state) # new state and we continue
{"timestamps": [1497474838.625608, 1497474900.764579, 1497474962.78539, 1497475149.929659, 1497475341.75359, 1497475403.907666, 1497475466.050186, 1497475527.984195, 1497475590.903063, 1497475778.065657, 1497475966.907116, 1497476153.816722, 1497476350.714945, 1497476414.613782, 1497476607.331991, 1497476671.648373, 1497476857.754195, 1497477045.688467, 1497477231.658886, 1497477430.295778, 1497477492.995207, 1497477680.911942, 1497477743.918765, 1497477930.515772, 1497477998.119777, 1497478066.323782, 1497478133.449904, 1497478199.332776, 1497478266.758708, 1497478331.483278, 1497478529.809023, 1497478599.229024, 1497478665.916518, 1497478735.296724, 1497478803.2471, 1497478870.462308, 1497478938.856184, 1497479009.062664, 1497479075.115077, 1497479164.611938, 1497479230.299145, 1497479421.021075, 1497479488.970275, 1497479552.508607, 1497479617.872668, 1497479806.611224, 1497479995.293183, 1497480182.079616, 1497480250.668939, 1497480317.122499, 1497480388.503761, 1497480453.280342, 1497480517.455752, 1497480583.618936, 1497480651.447811, 1497480718.334983, 1497482503.635591, 1497485748.693864, 1497485828.91551, 1497488460.129539, 1497491567.669423, 1497494813.144901, 1497494888.968478, 1497498074.744378, 1497498156.331747, 1497501334.672234, 1497501405.108372, 1497504600.709155, 1497504677.798919, 1497507868.457328, 1497507945.812659, 1497510907.987191, 1497510974.541058, 1497511043.75848, 1497511115.726684, 1497511184.109168, 1497511253.055375, 1497511325.15296, 1497511394.197304, 1497511465.77898, 1497511536.506502, 1497511612.095709, 1497511686.699079, 1497511758.460545, 1497511833.149438, 1497511905.044883, 1497511979.728534, 1497512051.736013, 1497512122.124992, 1497512192.650718, 1497512267.924831, 1497512339.514782, 1497512410.833858, 1497512488.907089, 1497512561.054158, 1497512639.570056, 1497512714.017086, 1497512786.701552, 1497512858.560962, 1497512929.528042, 1497512997.052131, 1497513076.340909, 1497513152.419017, 1497513230.862626, 1497513305.922902, 1497513385.088462, 1497513457.895777, 1497513530.780006, 1497513609.412855, 1497513682.406129, 1497513758.643327, 1497513837.961773, 1497513912.195992, 1497513984.7671, 1497514057.454018, 1497514134.096195, 1497514206.673027, 1497514280.674243, 1497514356.701312, 1497514432.553663, 1497514504.220083, 1497514575.885522, 1497514651.445447, 1497514729.73671, 1497514813.80632, 1497514890.34009, 1497514973.619363, 1497515047.079469, 1497515120.003953, 1497516964.8621, 1497518824.138026, 1497518899.920619, 1497520755.251322, 1497520897.606823, 1497520972.689283, 1497521048.454554, 1497521119.967964, 1497521197.672301, 1497521268.629531, 1497521347.57253, 1497521420.977261, 1497521495.742966, 1497521574.628306, 1497521648.989806, 1497521720.840984, 1497521801.450775, 1497521881.879115, 1497521965.054372, 1497522039.927412, 1497522119.655763, 1497522196.277732, 1497522287.974334, 1497522367.024164, 1497522439.685895, 1497522519.064411, 1497522603.760937, 1497522676.520156, 1497522750.342543, 1497522827.620407, 1497522907.644358, 1497522986.555456, 1497523069.865005, 1497523147.819061, 1497523228.776335, 1497523310.779176, 1497523390.411285, 1497523467.466662, 1497523554.3851, 1497523626.178604, 1497523710.902567, 1497523796.314991, 1497523875.459191, 1497523956.418871, 1497524044.643863, 1497524127.264616, 1497524203.976316, 1497524289.230321, 1497524372.667135, 1497524451.499831, 1497524527.803252, 1497524611.871406, 1497524688.231612, 1497524758.694489, 1497524840.277231, 1497524919.439067, 1497524998.526252, 1497525087.344646, 
1497525167.858968, 1497525246.663409, 1497525328.823342, 1497525415.336177, 1497525496.248604, 1497525577.49803, 1497525659.622651, 1497525735.535838, 1497525812.593749, 1497525895.919639, 1497525970.552508, 1497526047.957082, 1497526127.87573, 1497526209.592878, 1497526290.938023, 1497526367.901449, 1497526452.461498, 1497526534.051435, 1497526610.30394, 1497526689.929812, 1497526773.494307, 1497526849.468455, 1497526931.792018, 1497527016.855546, 1497527101.791257, 1497527182.237386, 1497527268.324452, 1497527349.069708, 1497527432.487799, 1497527523.656435, 1497527603.614958, 1497527685.233796, 1497527871.738331, 1497527949.830844, 1497528030.286451, 1497528115.320022, 1497528191.084883, 1497528276.136999, 1497528357.856767, 1497528439.403549, 1497528522.188319, 1497528610.307138, 1497528694.684263, 1497528769.760321, 1497528847.73313, 1497528927.247152, 1497528999.667958, 1497529074.934359, 1497529165.074753, 1497529244.797828, 1497529316.195247, 1497529503.63955, 1497529582.151106, 1497529664.236322, 1497529743.935488, 1497529829.571816, 1497529912.832294, 1497529991.737362, 1497530073.716307, 1497530155.539309, 1497530233.724801, 1497530313.002672, 1497530393.458363, 1497530480.47093, 1497530560.941567, 1497530644.727342, 1497530729.544386, 1497530816.002151, 1497530898.99281, 1497530988.074273, 1497531069.939651, 1497531150.617352, 1497531229.368019, 1497531312.327822, 1497531386.436073, 1497531471.745537, 1497531558.500785, 1497531644.798181, 1497531720.912677, 1497531805.780982, 1497531885.650736, 1497531969.110702, 1497533798.749066, 1497533884.296739, 1497533970.716533, 1497534057.407973, 1497534146.291083, 1497534227.71556, 1497534305.705014, 1497534391.203825, 1497534470.784589, 1497534554.366182, 1497534634.191658, 1497534713.586626, 1497534796.550327, 1497534880.997726, 1497534966.475726, 1497535048.344213, 1497535137.101273, 1497535219.345514, 1497535301.988087, 1497535386.148086, 1497535469.946905, 1497535555.549371, 1497535637.973007, 1497535728.871654, 1497535807.144276, 1497535902.552935, 1497535991.449845, 1497536081.445065, 1497536166.036504, 1497536250.338319, 1497536336.678974, 1497536424.133922, 1497536512.721328, 1497536599.357947, 1497536678.940411, 1497536762.783639, 1497536843.697335, 1497536921.024644, 1497537005.153102, 1497537081.254402, 1497537167.384946, 1497537250.754219, 1497537329.233754, 1497537417.593308, 1497537498.494686, 1497537588.581231, 1497537675.405, 1497537757.94362, 1497537844.20215, 1497537929.199078, 1497538013.355236, 1497538100.798037, 1497538182.693836, 1497538265.830326, 1497538347.368015, 1497538431.055388, 1497538514.1086, 1497538604.0088, 1497538685.160326, 1497538766.945111, 1497538850.305383, 1497538936.374253, 1497539018.270406, 1497539099.363212, 1497539180.582296, 1497539263.006884, 1497539344.158386, 1497539429.557793, 1497539515.700641, 1497539609.163851, 1497539697.833884, 1497539777.897959, 1497539865.55695, 1497539953.52782, 1497540038.630353, 1497540128.409797], "initial_reset_timestamp": 1497474837.225722, "episode_types": ["t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", 
"t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t"], "episode_lengths": [152, 167, 161, 10000, 10000, 172, 166, 166, 165, 10000, 10000, 10000, 10000, 261, 10000, 352, 10000, 10000, 10000, 10000, 237, 10000, 246, 10000, 610, 538, 496, 342, 557, 353, 10000, 726, 540, 748, 567, 557, 626, 768, 477, 2206, 459, 10000, 600, 287, 415, 10000, 10000, 10000, 706, 494, 901, 366, 316, 514, 657, 545, 1022, 797, 805, 472, 996, 737, 872, 707, 1103, 474, 627, 1224, 996, 914, 1026, 969, 474, 746, 971, 664, 731, 940, 682, 922, 809, 1275, 1077, 904, 1135, 940, 1163, 924, 807, 859, 1201, 884, 929, 1429, 962, 1394, 1132, 981, 973, 894, 595, 1505, 1271, 1360, 1105, 1507, 949, 1003, 1421, 954, 1208, 1526, 1123, 1008, 1012, 1300, 947, 1096, 1270, 1213, 949, 921, 1179, 1469, 1892, 1096, 1800, 1092, 1007, 696, 1183, 929, 1249, 1067, 1160, 1329, 864, 1302, 807, 1329, 1051, 1021, 1443, 992, 837, 1506, 1405, 1808, 1101, 1445, 1283, 2316, 1358, 890, 1433, 1700, 942, 1011, 1288, 1633, 1464, 1835, 1434, 1579, 1569, 1435, 1261, 1837, 864, 1821, 1985, 1503, 1637, 2093, 1667, 1175, 1861, 1683, 1474, 1160, 1713, 1294, 751, 1585, 1399, 1434, 1982, 1436, 1443, 1661, 1993, 1635, 1673, 1722, 1280, 1332, 1776, 1134, 1320, 1488, 1716, 1661, 1310, 1948, 1698, 1271, 1451, 1845, 1251, 1739, 1885, 1930, 1537, 2027, 1599, 1829, 2088, 1587, 1675, 10000, 1418, 1587, 1936, 1223, 1977, 1725, 1683, 1813, 2237, 1905, 1211, 1426, 1525, 977, 1233, 2340, 1575, 893, 10000, 1438, 1710, 1480, 2054, 1789, 1433, 1731, 1809, 1421, 1467, 1624, 2083, 1610, 1899, 1915, 2091, 1780, 2291, 1700, 1612, 1505, 1767, 1161, 1978, 2073, 2027, 1295, 1921, 1543, 1827, 966, 2007, 1935, 1928, 2060, 1522, 1298, 1899, 1425, 1717, 1562, 1419, 1690, 1814, 1944, 1670, 2259, 1647, 1781, 1717, 1713, 1884, 1636, 2274, 1271, 2466, 2005, 2166, 1778, 1770, 2004, 1973, 2110, 1992, 1435, 1755, 1524, 1289, 1681, 1216, 2137, 1797, 1423, 2231, 1600, 2302, 2067, 1713, 2037, 1944, 1942, 2147, 1715, 1830, 1522, 1651, 1559, 2200, 1641, 1706, 1760, 1905, 1687, 1620, 1673, 1728, 1643, 1949, 2075, 2443, 2198, 1545, 1995, 2014, 1655, 2155], "episode_rewards": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 4.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 4.0, 9.0, 11.0, 7.0, 4.0, 15.0, 4.0, 2.0, 10.0, 7.0, 12.0, 6.0, 9.0, 12.0, 12.0, 7.0, 14.0, 6.0, 6.0, 9.0, 3.0, 4.0, 15.0, 8.0, 13.0, 10.0, 7.0, 14.0, 8.0, 4.0, 7.0, 
10.0, 8.0, 18.0, 13.0, 13.0, 7.0, 21.0, 13.0, 14.0, 12.0, 27.0, 7.0, 11.0, 31.0, 22.0, 17.0, 17.0, 23.0, 7.0, 13.0, 22.0, 10.0, 23.0, 24.0, 10.0, 20.0, 14.0, 36.0, 20.0, 40.0, 22.0, 19.0, 28.0, 16.0, 14.0, 15.0, 27.0, 22.0, 22.0, 36.0, 16.0, 39.0, 39.0, 29.0, 21.0, 16.0, 9.0, 36.0, 30.0, 28.0, 25.0, 42.0, 26.0, 22.0, 38.0, 37.0, 34.0, 38.0, 20.0, 25.0, 27.0, 24.0, 32.0, 28.0, 30.0, 24.0, 19.0, 19.0, 36.0, 43.0, 54.0, 30.0, 53.0, 28.0, 24.0, 12.0, 32.0, 16.0, 34.0, 29.0, 41.0, 41.0, 15.0, 28.0, 25.0, 37.0, 22.0, 34.0, 44.0, 19.0, 28.0, 47.0, 40.0, 54.0, 24.0, 48.0, 38.0, 90.0, 48.0, 22.0, 36.0, 49.0, 17.0, 23.0, 40.0, 58.0, 61.0, 68.0, 38.0, 40.0, 48.0, 42.0, 41.0, 69.0, 19.0, 63.0, 63.0, 64.0, 57.0, 71.0, 63.0, 34.0, 60.0, 47.0, 43.0, 24.0, 59.0, 33.0, 12.0, 47.0, 62.0, 45.0, 68.0, 56.0, 42.0, 58.0, 84.0, 53.0, 52.0, 79.0, 40.0, 46.0, 86.0, 34.0, 37.0, 41.0, 61.0, 63.0, 42.0, 80.0, 59.0, 28.0, 57.0, 77.0, 48.0, 64.0, 60.0, 72.0, 63.0, 205.0, 45.0, 67.0, 78.0, 51.0, 64.0, 79.0, 43.0, 58.0, 69.0, 29.0, 56.0, 220.0, 71.0, 77.0, 84.0, 107.0, 75.0, 54.0, 65.0, 34.0, 145.0, 109.0, 50.0, 25.0, 73.0, 41.0, 67.0, 60.0, 64.0, 50.0, 70.0, 57.0, 97.0, 38.0, 50.0, 42.0, 112.0, 53.0, 65.0, 80.0, 106.0, 53.0, 255.0, 54.0, 63.0, 33.0, 75.0, 32.0, 81.0, 72.0, 80.0, 53.0, 66.0, 51.0, 72.0, 25.0, 90.0, 74.0, 73.0, 108.0, 58.0, 40.0, 80.0, 42.0, 83.0, 46.0, 65.0, 76.0, 71.0, 76.0, 66.0, 113.0, 75.0, 78.0, 53.0, 61.0, 251.0, 71.0, 93.0, 48.0, 132.0, 90.0, 89.0, 90.0, 91.0, 69.0, 247.0, 71.0, 74.0, 47.0, 75.0, 57.0, 62.0, 73.0, 39.0, 114.0, 60.0, 72.0, 119.0, 59.0, 110.0, 106.0, 75.0, 95.0, 65.0, 83.0, 98.0, 56.0, 62.0, 58.0, 62.0, 76.0, 108.0, 96.0, 61.0, 55.0, 72.0, 64.0, 61.0, 87.0, 73.0, 104.0, 66.0, 76.0, 102.0, 95.0, 143.0, 65.0, 89.0, 60.0, 98.0]}
\ No newline at end of file
{"env_info": {"env_id": "Breakout-v0", "gym_version": "0.9.1"}, "stats": "openaigym.episode_batch.0.4262.stats.json", "videos": [["openaigym.video.0.4262.video000000.mp4", "openaigym.video.0.4262.video000000.meta.json"], ["openaigym.video.0.4262.video000001.mp4", "openaigym.video.0.4262.video000001.meta.json"], ["openaigym.video.0.4262.video000008.mp4", "openaigym.video.0.4262.video000008.meta.json"], ["openaigym.video.0.4262.video000027.mp4", "openaigym.video.0.4262.video000027.meta.json"], ["openaigym.video.0.4262.video000064.mp4", "openaigym.video.0.4262.video000064.meta.json"], ["openaigym.video.0.4262.video000125.mp4", "openaigym.video.0.4262.video000125.meta.json"], ["openaigym.video.0.4262.video000216.mp4", "openaigym.video.0.4262.video000216.meta.json"], ["openaigym.video.0.4262.video000343.mp4", "openaigym.video.0.4262.video000343.meta.json"]]}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000000.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 0}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000001.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 1}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000008.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 8}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000027.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 27}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000064.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 64}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000125.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 125}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000216.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 216}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000343.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 343}
\ No newline at end of file
# Training the AI
import torch
import torch.nn.functional as F
from envs import create_atari_env
from model import ActorCritic
from torch.autograd import Variable
# Implementing a function to make sure the models share the same gradient
def ensure_shared_grads(model, shared_model):
for param, shared_param in zip(model.parameters(), shared_model.parameters()):
if shared_param.grad is not None:
return
shared_param._grad = param.grad
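# Note on the early return above: the parameters of the shared model live in shared memory,
# but gradients are allocated separately in each process. The first call in a training
# process points shared_param._grad at this agent's local gradient tensors; on later calls
# shared_param.grad is no longer None, so the link already exists and nothing needs to be copied.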
def train(rank, params, shared_model, optimizer):
torch.manual_seed(params.seed + rank) # shifting the seed with rank to asynchronize each training agent
env = create_atari_env(params.env_name) # creating an optimized environment thanks to the create_atari_env function
env.seed(params.seed + rank) # aligning the seed of the environment on the seed of the agent
model = ActorCritic(env.observation_space.shape[0], env.action_space) # creating the model from the ActorCritic class
state = env.reset() # state is a numpy array of size 1*42*42, in black & white
state = torch.from_numpy(state) # converting the numpy array into a torch tensor
done = True # when the game is done
episode_length = 0 # initializing the length of an episode to 0
while True: # repeat
episode_length += 1 # incrementing the episode length by one
model.load_state_dict(shared_model.state_dict()) # synchronizing with the shared model - the agent gets the shared model to do an exploration on num_steps
if done: # if it is the first iteration of the while loop or if the game was just done, then:
cx = Variable(torch.zeros(1, 256)) # the cell states of the LSTM are reinitialized to zero
hx = Variable(torch.zeros(1, 256)) # the hidden states of the LSTM are reinitialized to zero
else: # else:
cx = Variable(cx.data) # we keep the old cell states, making sure they are in a torch variable
hx = Variable(hx.data) # we keep the old hidden states, making sure they are in a torch variable
values = [] # initializing the list of values (V(S))
log_probs = [] # initializing the list of log probabilities
rewards = [] # initializing the list of rewards
entropies = [] # initializing the list of entropies
for step in range(params.num_steps): # going through the num_steps exploration steps
value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx))) # getting from the model the output V(S) of the critic, the output Q(S,A) of the actor, and the new hidden & cell states
prob = F.softmax(action_values) # generating a distribution of probabilities over the possible actions with the softmax: prob(a) = exp(q(a))/sum_b(exp(q(b)))
log_prob = F.log_softmax(action_values) # generating a distribution of log probabilities of the Q-values according to the log softmax: log_prob(a) = log(prob(a))
entropy = -(log_prob * prob).sum(1) # H(p) = - sum_x p(x).log(p(x))
entropies.append(entropy) # storing the computed entropy
action = prob.multinomial().data # selecting an action by taking a random draw from the prob distribution
log_prob = log_prob.gather(1, Variable(action)) # getting the log prob associated to this selected action
values.append(value) # storing the value V(S) of the state
log_probs.append(log_prob) # storing the log prob of the action
state, reward, done, _ = env.step(action.numpy()) # playing the selected action, reaching the new state, and getting the new reward
done = (done or episode_length >= params.max_episode_length) # if the episode lasts too long (the agent is stuck), then it is done
reward = max(min(reward, 1), -1) # clamping the reward between -1 and +1
if done: # if the episode is done:
episode_length = 0 # we reset the episode length
state = env.reset() # we restart the environment
state = torch.from_numpy(state) # tensorizing the new state
rewards.append(reward) # storing the new observed reward
if done: # if we are done
break # we stop the exploration and we directly move on to the next step: the update of the shared model
R = torch.zeros(1, 1) # initializing the cumulative reward
if not done: # if we are not done:
value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx))) # we get from the critic the value of the last reached state
R = value.data # we bootstrap the cumulative reward with the value of the last reached state
values.append(Variable(R)) # storing the value V(S) of the last reached state S
policy_loss = 0 # initializing the policy loss
value_loss = 0 # initializing the value loss
R = Variable(R) # making sure the cumulative reward R is a torch Variable
gae = torch.zeros(1, 1) # initializing the Generalized Advantage Estimation to 0
for i in reversed(range(len(rewards))): # starting from the last exploration step and going back in time
R = params.gamma * R + rewards[i] # R = r_i + gamma*R, so after the full loop R = r_0 + gamma*r_1 + ... + gamma^(n-1)*r_(n-1) + gamma^n * V(last_state)
advantage = R - values[i] # R is an estimator of Q at time t = i so advantage_i = Q_i - V(state_i) = R - value[i]
value_loss = value_loss + 0.5 * advantage.pow(2) # computing the value loss
TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data # computing the temporal difference
gae = gae * params.gamma * params.tau + TD # gae = sum_i (gamma*tau)^i * TD(i) with gae_i = gae_(i+1)*gamma*tau + (r_i + gamma*V(state_i+1) - V(state_i))
policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i] # computing the policy loss
optimizer.zero_grad() # re-initializing the gradients to zero before backpropagation
(policy_loss + 0.5 * value_loss).backward() # backpropagating the total loss, with the value loss scaled by 0.5 so that the (typically smaller) policy loss keeps enough weight
torch.nn.utils.clip_grad_norm(model.parameters(), 40) # clipping the norm of the gradients to at most 40, to prevent exploding gradients from degenerating the training
ensure_shared_grads(model, shared_model) # making sure the model of the agent and the shared model share the same gradient
optimizer.step() # running the optimization step
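As a side note on the update step above, the following standalone sketch (made-up numbers, plain Python floats instead of torch tensors; not part of the course code) shows how the n-step return R and the generalized advantage estimate are accumulated backwards through the stored rewards and values:
```
# Toy illustration of the reverse accumulation performed in train() above.
# The numbers are invented; in training they come from the critic and the environment.
gamma, tau = 0.99, 1.0
rewards = [0.0, 1.0, 0.0]        # hypothetical clipped rewards r_0, r_1, r_2
values  = [0.5, 0.6, 0.4, 0.3]   # hypothetical V(s_0)..V(s_3); the last one bootstraps R

R = values[-1]                   # if the episode had ended, R would start at 0 instead
gae = 0.0
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]                            # n-step estimate of Q(s_i, a_i)
    td = rewards[i] + gamma * values[i + 1] - values[i]   # temporal difference delta_i
    gae = gae * gamma * tau + td                          # gae_i = delta_i + gamma*tau*gae_(i+1)
    print(i, round(R, 3), round(gae, 3))
```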
This repo contains the projects implemented during the [Intelligence Artificielle de A à Z](https://www.udemy.com/intelligence-artificielle-az/?couponCode=WEBSITE) course, using `PyTorch`.
1. [Installing the modules - Self Driving Car](#installation-des-modules---self-driving-car)
   1. [Installing PyTorch](#installation-de-pytorch)
   2. [Installing Kivy](#installation-de-kivy)
   3. [Testing your installation](#tester-votre-installation)
2. [Installing the modules - Doom](#installation-des-modules---doom)
   1. [Installing OpenAI Gym](#installer-openai-gym)
   2. [Validating your installation](#valider-votre-installation)
3. [Installing the modules - Breakout](#installation-des-modules---breakout)
   1. [Installing a new environment](#installer-un-nouvel-environnement)
4. [F.A.Q](#faq)
   1. [pip3 is not recognized as an internal or external command](#pip3-is-not-recognized-as-an-internal-or-external-command)
   2. [distributed 1.22.0 requires msgpack, which is not installed](#distributed-1220-requires-msgpack-which-is-not-installed)
   3. [tensorflow 1.9.0 has requirement setuptools<=39.1.0](#tensorflow-190-has-requirement-setuptools3910)
   6. [No module named 'kivy'](#no-module-named-kivy)
   7. [No module named 'matplotlib'](#no-module-named-matplotlib)
## Installing the modules - Self Driving Car
After installing [Anaconda](https://anaconda.org/), follow the instructions below:
```
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
```
## Installing the modules - Doom
### Installing OpenAI Gym
`OpenAI Gym` needs to be installed for parts 2 and 3 of the course.
You should not get any error message.
If you do get one, retry the steps above and check the FAQ below for help.
## Installing the modules - Breakout
### Installing a new environment
For Breakout, we need to create a new Anaconda environment under Python 2.7 to make sure it is compatible with the modules used.
In the console, on Mac or Ubuntu:
```
conda create -n breakout python=2.7
conda activate breakout
conda install spyder
conda install -c pytorch pytorch-cpu=0.3.1
pip install gym==0.7.4
conda install -c menpo opencv=2.4.11
pip install atari-py==0.0.21
conda install -c conda-forge ffmpeg=3.2.4
spyder
```
**Note**: Make sure you always activate the environment (`conda activate breakout`) before launching `spyder`.
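Once the environment is created, a quick sanity check can confirm that the main modules import correctly. This is only a suggested sketch, not part of the course instructions; run it from Spyder or a Python console with the `breakout` environment activated:
```
# Suggested sanity check for the 'breakout' environment (not part of the course code).
import cv2      # OpenCV 2.4.11 (menpo channel)
import gym      # gym 0.7.4
import torch    # PyTorch 0.3.1 (CPU build)

print(torch.__version__)          # should print 0.3.1
env = gym.make('Breakout-v0')     # needs atari-py; should build without errors
print(env.action_space)           # the discrete Breakout action space
env.reset()
```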
## F.A.Q.
### pip3 is not recognized as an internal or external command
......