Commit 5656ea9b authored by Charles

add project 3 breakout

parent ab664dd5
Pipeline #53 canceled
# Improvement of the Gym environment with universe
import cv2
import gym
import numpy as np
from gym.spaces.box import Box
from gym import wrappers
# Taken from https://github.com/openai/universe-starter-agent
def create_atari_env(env_id, video=False):
    env = gym.make(env_id)
    if video:
        env = wrappers.Monitor(env, 'test', force=True)
    env = MyAtariRescale42x42(env)
    env = MyNormalizedEnv(env)
    return env

def _process_frame42(frame):
    frame = frame[34:34 + 160, :160]
    # Resize by half, then down to 42x42 (essentially mipmapping). If
    # we resize directly we lose pixels that, when mapped to 42x42,
    # aren't close enough to the pixel boundary.
    frame = cv2.resize(frame, (80, 80))
    frame = cv2.resize(frame, (42, 42))
    frame = frame.mean(2)
    frame = frame.astype(np.float32)
    frame *= (1.0 / 255.0)
    #frame = np.reshape(frame, [1, 42, 42])
    return frame
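
# A quick sanity check for _process_frame42 (an illustrative helper, not part
# of the original code): the raw Atari screen returned by Gym is a 210x160x3
# uint8 array, and the function should map it to a float32 42x42 image in [0, 1].
def _sanity_check_frame42():
    dummy = np.random.randint(0, 256, (210, 160, 3)).astype(np.uint8) # fake raw screen
    out = _process_frame42(dummy)
    assert out.shape == (42, 42)
    assert out.dtype == np.float32
    assert 0.0 <= out.min() and out.max() <= 1.0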
class MyAtariRescale42x42(gym.ObservationWrapper):

    def __init__(self, env=None):
        super(MyAtariRescale42x42, self).__init__(env)
        self.observation_space = Box(0.0, 1.0, [1, 42, 42])

    def _observation(self, observation):
        return _process_frame42(observation)

class MyNormalizedEnv(gym.ObservationWrapper):

    def __init__(self, env=None):
        super(MyNormalizedEnv, self).__init__(env)
        self.state_mean = 0
        self.state_std = 0
        self.alpha = 0.9999
        self.num_steps = 0

    def _observation(self, observation):
        self.num_steps += 1
        self.state_mean = self.state_mean * self.alpha + \
            observation.mean() * (1 - self.alpha)
        self.state_std = self.state_std * self.alpha + \
            observation.std() * (1 - self.alpha)
        unbiased_mean = self.state_mean / (1 - pow(self.alpha, self.num_steps))
        unbiased_std = self.state_std / (1 - pow(self.alpha, self.num_steps))
        ret = (observation - unbiased_mean) / (unbiased_std + 1e-8)
        return np.expand_dims(ret, axis=0)
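
# The running normalization above uses the same bias correction as Adam: an
# exponential moving average started at 0 underestimates the true statistics
# early on, and dividing by (1 - alpha^n) removes that bias. A minimal numeric
# sketch (illustrative only): feeding a constant stream of 5.0 yields an
# unbiased mean of exactly 5.0 from the very first step.
def _bias_correction_demo(alpha=0.9999, steps=5):
    mean = 0.0
    for n in range(1, steps + 1):
        mean = mean * alpha + 5.0 * (1 - alpha) # biased running mean
        print(n, mean / (1 - alpha ** n))       # unbiased estimate: 5.0 at every step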
# Main code
from __future__ import print_function
import os
import torch
import torch.multiprocessing as mp
from envs import create_atari_env
from model import ActorCritic
from train import train
from test import test
import my_optim
# Gathering all the parameters (that we can modify to explore)
class Params():

    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'
# Main run
os.environ['OMP_NUM_THREADS'] = '1'
params = Params()
torch.manual_seed(params.seed)
env = create_atari_env(params.env_name)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
optimizer.share_memory()
processes = []
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))
p.start()
processes.append(p)
for rank in range(0, params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
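
# A minimal sketch (illustrative, not part of the course code) of what
# share_memory() buys us: once a tensor's storage is moved to shared memory,
# child processes created by torch.multiprocessing see updates to it without
# any copying.
def _shared_memory_demo():
    t = torch.zeros(3)
    t.share_memory_() # move the tensor's storage to shared memory
    print(t.is_shared()) # True: the same storage is now visible to child processes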
# AI for Breakout
# Importing the libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# Initializing and setting the variance of a tensor of weights
def normalized_columns_initializer(weights, std=1.0):
    out = torch.randn(weights.size())
    out *= std / torch.sqrt(out.pow(2).sum(1, True))
    return out
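
# A minimal check (illustrative) of the initializer above: despite its name,
# it normalizes along dim 1, so each row of the returned weight matrix has an
# L2 norm of exactly std.
def _init_norm_demo():
    w = normalized_columns_initializer(torch.zeros(6, 4), std=0.01)
    print(torch.sqrt(w.pow(2).sum(1))) # every entry is 0.01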
# Initializing the weights of the neural network in an optimal way for learning
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        weight_shape = list(m.weight.data.size())
        fan_in = np.prod(weight_shape[1:4])
        fan_out = np.prod(weight_shape[2:4]) * weight_shape[0]
        w_bound = np.sqrt(6. / (fan_in + fan_out))
        m.weight.data.uniform_(-w_bound, w_bound)
        m.bias.data.fill_(0)
    elif classname.find('Linear') != -1:
        weight_shape = list(m.weight.data.size())
        fan_in = weight_shape[1]
        fan_out = weight_shape[0]
        w_bound = np.sqrt(6. / (fan_in + fan_out))
        m.weight.data.uniform_(-w_bound, w_bound)
        m.bias.data.fill_(0)
# Making the A3C brain
class ActorCritic(torch.nn.Module):

    def __init__(self, num_inputs, action_space):
        super(ActorCritic, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.lstm = nn.LSTMCell(32 * 3 * 3, 256)
        num_outputs = action_space.n
        self.critic_linear = nn.Linear(256, 1)
        self.actor_linear = nn.Linear(256, num_outputs)
        self.apply(weights_init)
        self.actor_linear.weight.data = normalized_columns_initializer(self.actor_linear.weight.data, 0.01)
        self.actor_linear.bias.data.fill_(0)
        self.critic_linear.weight.data = normalized_columns_initializer(self.critic_linear.weight.data, 1.0)
        self.critic_linear.bias.data.fill_(0)
        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)
        self.train()

    def forward(self, inputs):
        inputs, (hx, cx) = inputs
        x = F.elu(self.conv1(inputs))
        x = F.elu(self.conv2(x))
        x = F.elu(self.conv3(x))
        x = F.elu(self.conv4(x))
        x = x.view(-1, 32 * 3 * 3)
        hx, cx = self.lstm(x, (hx, cx))
        x = hx
        return self.critic_linear(x), self.actor_linear(x), (hx, cx)
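
# A minimal shape walk-through (illustrative) of the network above on the
# 1x42x42 frames produced by the env wrappers: each kernel-3, stride-2,
# padding-1 convolution takes 42 -> 21 -> 11 -> 6 -> 3, which is where the
# 32 * 3 * 3 flatten size comes from. On a recent PyTorch (where Variable and
# Tensor are merged) the forward pass can be exercised with plain tensors; on
# the 0.x versions this repo targets, wrap them in Variable first.
def _forward_shape_demo():
    import gym # Discrete(4) mimics Breakout's four actions (an assumption for the demo)
    net = ActorCritic(1, gym.spaces.Discrete(4))
    x = torch.zeros(1, 1, 42, 42)
    hx, cx = torch.zeros(1, 256), torch.zeros(1, 256)
    value, logits, (hx, cx) = net((x, (hx, cx)))
    print(value.size(), logits.size()) # (1, 1) for V(S) and (1, 4) for the action scores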
# Optimizer
import math
import torch
import torch.optim as optim
class SharedAdam(optim.Adam):

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
                state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self):
        loss = None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'][0]
                bias_correction2 = 1 - beta2 ** state['step'][0]
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(-step_size, exp_avg, denom)
        return loss
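
# A minimal usage sketch (illustrative): SharedAdam behaves like Adam except
# that its step counter and moment buffers live in shared memory, so every
# worker process updates the same optimizer state. Like the class above, it
# uses the old value-first add_/addcdiv_ overloads, so this sketch assumes the
# PyTorch 0.x API the repo targets.
def _shared_adam_demo():
    import torch.nn as nn
    from torch.autograd import Variable
    model = nn.Linear(4, 2)
    model.share_memory()
    opt = SharedAdam(model.parameters(), lr=1e-3)
    opt.share_memory() # moment buffers now visible to all worker processes
    loss = model(Variable(torch.randn(8, 4))).sum()
    loss.backward()
    opt.step()
    print(opt.state[next(model.parameters())]['step']) # shared step counter: 1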
# Test Agent
import torch
import torch.nn.functional as F
from envs import create_atari_env
from model import ActorCritic
from torch.autograd import Variable
import time
from collections import deque
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0])
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
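
# In the test agent above, actions are chosen greedily (argmax of the softmax)
# instead of being sampled as in training, since evaluation should show the
# current best policy. A minimal sketch of that selection step (illustrative;
# written against the modern F.softmax signature with an explicit dim):
def _greedy_action_demo():
    logits = torch.Tensor([[0.1, 2.0, -1.0, 0.5]]) # fake action scores for 4 actions
    prob = F.softmax(logits, dim=1)
    action = prob.max(1)[1] # index of the highest-probability action
    print(action[0]) # 1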
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000000.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 0}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000001.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 1}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000008.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 8}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000027.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 27}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000064.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 64}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000125.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 125}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000216.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 216}
\ No newline at end of file
{"encoder_version": {"cmdline": ["ffmpeg", "-nostats", "-loglevel", "error", "-y", "-r", "30", "-f", "rawvideo", "-s:v", "160x210", "-pix_fmt", "rgb24", "-i", "-", "-vcodec", "libx264", "-pix_fmt", "yuv420p", "/Users/Hadelin/Documents/Udemy/Teaching/AI A-Z/Module 3 - Breakout/Breakout OpenAI/Breakout_A3C/test/openaigym.video.0.4262.video000343.mp4"], "version": "ffmpeg version 3.2.4 Copyright (c) 2000-2017 the FFmpeg developers\nbuilt with Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)\nconfiguration: --prefix=/Users/Hadelin/anaconda --disable-doc --enable-shared --enable-static --extra-cflags='-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC -I/Users/Hadelin/anaconda/include' --extra-cxxflags='=-Wall -g -m64 -pipe -O3 -march=x86-64 -fPIC' --extra-libs='-L/Users/Hadelin/anaconda/lib -lz' --enable-pic --enable-gpl --enable-version3 --enable-hardcoded-tables --enable-avresample --enable-libx264\nlibavutil 55. 34.101 / 55. 34.101\nlibavcodec 57. 64.101 / 57. 64.101\nlibavformat 57. 56.101 / 57. 56.101\nlibavdevice 57. 1.100 / 57. 1.100\nlibavfilter 6. 65.100 / 6. 65.100\nlibavresample 3. 1. 0 / 3. 1. 0\nlibswscale 4. 2.100 / 4. 2.100\nlibswresample 2. 3.100 / 2. 3.100\nlibpostproc 54. 1.100 / 54. 1.100\n", "backend": "ffmpeg"}, "content_type": "video/mp4", "episode_id": 343}
\ No newline at end of file
# Training the AI
import torch
import torch.nn.functional as F
from envs import create_atari_env
from model import ActorCritic
from torch.autograd import Variable
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad

def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    episode_length = 0
    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        values = []
        log_probs = []
        rewards = []
        entropies = []
        for step in range(params.num_steps):
            value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(action_values)
            log_prob = F.log_softmax(action_values)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)
            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))
            values.append(value)
            log_probs.append(log_prob)
            state, reward, done, _ = env.step(action.numpy())
            done = (done or episode_length >= params.max_episode_length)
            reward = max(min(reward, 1), -1)
            if done:
                episode_length = 0
                state = env.reset()
            state = torch.from_numpy(state)
            rewards.append(reward)
            if done:
                break
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data
        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = params.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            gae = gae * params.gamma * params.tau + TD
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]
        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
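
# A minimal numeric sketch (illustrative) of the credit-assignment loop above:
# moving backwards through the rollout, R accumulates the discounted n-step
# return used for the value loss, while gae accumulates the discounted sum of
# TD errors (generalized advantage estimation) used for the policy loss.
def _gae_demo(gamma=0.99, tau=1.0):
    rewards = [0.0, 0.0, 1.0]       # toy 3-step rollout
    values = [0.5, 0.6, 0.8, 0.0]   # V(s_0)..V(s_3); 0.0 for the terminal state
    R, gae = values[-1], 0.0
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]
        td = rewards[i] + gamma * values[i + 1] - values[i] # TD error at step i
        gae = gae * gamma * tau + td                        # discounted sum of TD errors
        print(i, round(R, 4), round(td, 4), round(gae, 4))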
# Main code
from __future__ import print_function
import os
import torch
import torch.multiprocessing as mp
from envs import create_atari_env
from model import ActorCritic
from train import train
from test import test
import my_optim
# Gathering all the parameters (that we can modify to explore)
class Params():

    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'
# Main run
os.environ['OMP_NUM_THREADS'] = '1' # restricting each process to a single OpenMP thread
params = Params() # creating the params object that holds all the model hyperparameters
torch.manual_seed(params.seed) # setting the seed (not essential)
env = create_atari_env(params.env_name) # creating the optimized environment thanks to the wrappers in envs.py
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space) # shared_model is the model shared by the different agents (different processes, possibly on different cores)
shared_model.share_memory() # storing the model in shared memory, so that all the processes can access it even when they run on different cores
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr) # the optimizer is also shared because it acts on the shared model
optimizer.share_memory() # likewise, the optimizer state lives in shared memory so that every agent can use it to optimize the model
processes = [] # initializing the list of processes
p = mp.Process(target=test, args=(params.num_processes, params, shared_model)) # creating the 'test' process with the arguments 'args' passed to the 'test' target function; it doesn't update the shared model, it only evaluates it (torch.multiprocessing.Process runs a function in a separate process)
p.start() # starting the created process p
processes.append(p) # adding the created process p to the list of processes
for rank in range(0, params.num_processes): # looping to launch all the training processes, each of which updates the shared model
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes: # joining each process, i.e. blocking the main script until every worker exits, so the program shuts down cleanly
    p.join()
# AI for Breakout
# Importing the libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# Initializing and setting the variance of a tensor of weights
def normalized_columns_initializer(weights, std=1.0):
    out = torch.randn(weights.size())
    out *= std / torch.sqrt(out.pow(2).sum(1, True)) # thanks to this initialization, each row of the weight tensor has an L2 norm of exactly std
    return out
# Initializing the weights of the neural network in an optimal way for learning
def weights_init(m):
    classname = m.__class__.__name__ # a Python trick to get the type of connection in the object "m" (convolution or full connection)
    if classname.find('Conv') != -1: # if the connection is a convolution
        weight_shape = list(m.weight.data.size()) # list containing the shape of the weights in the object "m"
        fan_in = np.prod(weight_shape[1:4]) # dim1 * dim2 * dim3
        fan_out = np.prod(weight_shape[2:4]) * weight_shape[0] # dim0 * dim2 * dim3
        w_bound = np.sqrt(6. / (fan_in + fan_out)) # weight bound
        m.weight.data.uniform_(-w_bound, w_bound) # generating random weights with magnitude inversely proportional to the size of the tensor of weights
        m.bias.data.fill_(0) # initializing all the biases with zeros
    elif classname.find('Linear') != -1: # if the connection is a full connection
        weight_shape = list(m.weight.data.size()) # list containing the shape of the weights in the object "m"
        fan_in = weight_shape[1] # dim1
        fan_out = weight_shape[0] # dim0
        w_bound = np.sqrt(6. / (fan_in + fan_out)) # weight bound
        m.weight.data.uniform_(-w_bound, w_bound) # generating random weights with magnitude inversely proportional to the size of the tensor of weights
        m.bias.data.fill_(0) # initializing all the biases with zeros
# Making the A3C brain
class ActorCritic(torch.nn.Module):

    def __init__(self, num_inputs, action_space):
        super(ActorCritic, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1) # first convolution
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1) # second convolution
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1) # third convolution
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1) # fourth convolution
        self.lstm = nn.LSTMCell(32 * 3 * 3, 256) # an LSTM (Long Short Term Memory) cell that learns the temporal properties of the input; it produces an encoded vector of size 256 representing the current state of the game
        num_outputs = action_space.n # getting the number of possible actions
        self.critic_linear = nn.Linear(256, 1) # full connection of the critic: output = V(S)
        self.actor_linear = nn.Linear(256, num_outputs) # full connection of the actor: output = the action scores that the softmax turns into the policy over actions
        self.apply(weights_init) # initializing the weights of the model with random weights
        self.actor_linear.weight.data = normalized_columns_initializer(self.actor_linear.weight.data, 0.01) # setting the standard deviation of the actor's tensor of weights to 0.01
        self.actor_linear.bias.data.fill_(0) # initializing the actor's bias with zeros
        self.critic_linear.weight.data = normalized_columns_initializer(self.critic_linear.weight.data, 1.0) # setting the standard deviation of the critic's tensor of weights to 1.0
        self.critic_linear.bias.data.fill_(0) # initializing the critic's bias with zeros
        self.lstm.bias_ih.data.fill_(0) # initializing the lstm input-hidden bias with zeros
        self.lstm.bias_hh.data.fill_(0) # initializing the lstm hidden-hidden bias with zeros
        self.train() # setting the module in "train" mode (this would activate dropout and batch norm layers if the model had any)

    def forward(self, inputs):
        inputs, (hx, cx) = inputs # unpacking the input images and the tuple (hidden state, cell state)
        x = F.elu(self.conv1(inputs)) # forward propagating the signal from the input images to the 1st convolutional layer
        x = F.elu(self.conv2(x)) # forward propagating the signal from the 1st convolutional layer to the 2nd convolutional layer
        x = F.elu(self.conv3(x)) # forward propagating the signal from the 2nd convolutional layer to the 3rd convolutional layer
        x = F.elu(self.conv4(x)) # forward propagating the signal from the 3rd convolutional layer to the 4th convolutional layer
        x = x.view(-1, 32 * 3 * 3) # flattening the last convolutional layer into a 1D vector x
        hx, cx = self.lstm(x, (hx, cx)) # the LSTM takes x and the old hidden & cell states as input and outputs the new hidden & cell states
        x = hx # the hidden state is the useful output of the LSTM