A3C

A3C creates multiple parallel copies of the environment and lets several worker agents, each holding a local copy of the network, update the parameters of a shared global network at the same time. The workers do not interfere with one another, and because each one pushes its updates to the global network asynchronously at its own pace, successive updates to the global parameters are much less correlated than the updates from a single agent would be, which improves convergence. A toy sketch of this pattern follows.
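As a minimal illustration of the asynchronous update pattern (this is just the threading skeleton, not the full A3C algorithm — the worker function, the learning rate 0.01, and the random "gradients" are all made up for the example), several threads push updates to one shared parameter store without waiting for each other:

import threading
import numpy as np

# shared "global network" parameters, updated by all workers
global_weights = np.zeros(4)
lock = threading.Lock()  # serializes the in-place updates in this toy example

def worker(worker_id, steps=5):
    rng = np.random.default_rng(worker_id)
    for _ in range(steps):
        # each worker computes a gradient from its *own* experience ...
        local_grad = rng.normal(size=global_weights.shape)
        # ... and pushes it to the global parameters whenever it is ready,
        # so updates from different workers interleave asynchronously
        with lock:
            global_weights -= 0.01 * local_grad

threads = [threading.Thread(target=worker, args=(i,)) for i in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(global_weights)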

Code example (Keras 2.x on a TensorFlow 1.x backend; tf.InteractiveSession and _make_predict_function do not exist in TF 2.x / current Keras)

import threading
import numpy as np
import tensorflow as tf
import pylab
import time
import gym
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K

# global variables shared by the worker threads
episode = 0
scores = []
EPISODES = 2000


# This is the A3C (Asynchronous Advantage Actor-Critic) agent (global) for CartPole.
# In this example, we use the A3C algorithm.
class A3CAgent:
    def __init__(self, state_size, action_size, env_name):
        # get the sizes of the state and action spaces
        self.state_size = state_size
        self.action_size = action_size
        # get the gym environment name
        self.env_name = env_name

        # these are the hyperparameters for A3C
        self.actor_lr = 0.001
        self.critic_lr = 0.001
        self.discount_factor = .99
        self.hidden1, self.hidden2 = 24, 24
        self.threads = 8  # 8 worker threads run in parallel

        # create the models for the actor and critic networks
        self.actor, self.critic = self.build_model()

        # training functions for the actor and critic networks
        self.optimizer = [self.actor_optimizer(), self.critic_optimizer()]

        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)
        self.sess.run(tf.global_variables_initializer())

    # approximate the policy and value with a neural network
    # actor -> input is the state, output is the probability of each action
    # critic -> input is the state, output is the value of the state
    # the actor and critic networks share the first hidden layer
    def build_model(self):
        state = Input(batch_shape=(None, self.state_size))
        shared = Dense(self.hidden1, input_dim=self.state_size, activation='relu',
                       kernel_initializer='glorot_uniform')(state)

        actor_hidden = Dense(self.hidden2, activation='relu',
                             kernel_initializer='glorot_uniform')(shared)
        action_prob = Dense(self.action_size, activation='softmax',
                            kernel_initializer='glorot_uniform')(actor_hidden)

        value_hidden = Dense(self.hidden2, activation='relu',
                             kernel_initializer='he_uniform')(shared)
        state_value = Dense(1, activation='linear',
                            kernel_initializer='he_uniform')(value_hidden)

        actor = Model(inputs=state, outputs=action_prob)
        critic = Model(inputs=state, outputs=state_value)

        # build the predict functions up front so the models can be
        # called from multiple threads
        actor._make_predict_function()
        critic._make_predict_function()

        actor.summary()
        critic.summary()

        return actor, critic

    # make the loss function for the policy gradient:
    # [log(action probability) * advantages] is the input for the backprop,
    # and we add the entropy of the action probabilities to the loss
    def actor_optimizer(self):
        action = K.placeholder(shape=(None, self.action_size))
        advantages = K.placeholder(shape=(None, ))

        policy = self.actor.output

        good_prob = K.sum(action * policy, axis=1)
        # stop_gradient keeps the advantage out of the gradient computation
        eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
        loss = -K.sum(eligibility)

        # K.sum(p * log p) is the negative entropy, so adding it with a small
        # weight pushes the policy toward higher entropy (more exploration)
        entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)

        actor_loss = loss + 0.01 * entropy

        optimizer = Adam(lr=self.actor_lr)
        updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
        train = K.function([self.actor.input, action, advantages], [], updates=updates)
        return train

    # make the loss function for the value approximation
    def critic_optimizer(self):
        discounted_reward = K.placeholder(shape=(None, ))

        value = self.critic.output

        loss = K.mean(K.square(discounted_reward - value))

        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
        train = K.function([self.critic.input, discounted_reward], [], updates=updates)
        return train

    # make the (local) agents and start training
    def train(self):
        # self.load_model('./save_model/cartpole_a3c.h5')
        # create 8 local agents
        agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name,
                        self.discount_factor, self.action_size, self.state_size)
                  for i in range(self.threads)]

        for agent in agents:
            agent.start()

        while True:
            time.sleep(20)

            plot = scores[:]
            pylab.plot(range(len(plot)), plot, 'b')
            pylab.savefig("./save_graph/cartpole_a3c.png")

            self.save_model('./save_model/cartpole_a3c.h5')

    def save_model(self, name):
        self.actor.save_weights(name + "_actor.h5")
        self.critic.save_weights(name + "_critic.h5")

    def load_model(self, name):
        self.actor.load_weights(name + "_actor.h5")
        self.critic.load_weights(name + "_critic.h5")


# This is the (local) Agent class used for threading
class Agent(threading.Thread):
    def __init__(self, index, actor, critic, optimizer, env_name, discount_factor,
                 action_size, state_size):
        threading.Thread.__init__(self)

        self.states = []
        self.rewards = []
        self.actions = []

        self.index = index
        self.actor = actor
        self.critic = critic
        self.optimizer = optimizer
        self.env_name = env_name
        self.discount_factor = discount_factor
        self.action_size = action_size
        self.state_size = state_size

    # each thread interacts with its own copy of the environment
    def run(self):
        global episode
        env = gym.make(self.env_name)
        while episode < EPISODES:
            state = env.reset()
            score = 0
            while True:
                action = self.get_action(state)
                next_state, reward, done, _ = env.step(action)
                score += reward

                self.memory(state, action, reward)

                state = next_state

                if done:
                    episode += 1
                    print("episode: ", episode, "/ score : ", score)
                    scores.append(score)
                    # a score of 500 means the episode hit CartPole-v1's time
                    # limit rather than a true terminal state, so in that case
                    # we pass done=False and bootstrap from the critic
                    self.train_episode(score != 500)
                    break

    # In Policy Gradient, the Q function is not available.
    # Instead the agent uses sampled returns for evaluating the policy.
    def discount_rewards(self, rewards, done=True):
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        if not done:
            running_add = self.critic.predict(
                np.reshape(self.states[-1], (1, self.state_size)))[0]
        for t in reversed(range(0, len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    # save <s, a, r> of each step;
    # this is used for calculating the discounted rewards
    def memory(self, state, action, reward):
        self.states.append(state)
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)
        self.rewards.append(reward)

    # update the policy network and value network every episode
    def train_episode(self, done):
        discounted_rewards = self.discount_rewards(self.rewards, done)

        values = self.critic.predict(np.array(self.states))
        values = np.reshape(values, len(values))

        advantages = discounted_rewards - values

        self.optimizer[0]([self.states, self.actions, advantages])
        self.optimizer[1]([self.states, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []

    def get_action(self, state):
        policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]


if __name__ == "__main__":
    env_name = 'CartPole-v1'
    env = gym.make(env_name)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    env.close()

    global_agent = A3CAgent(state_size, action_size, env_name)
    global_agent.train()
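To make the advantage computation concrete, here is a small standalone re-run of the same backward-discounting logic on a made-up three-step episode (the function name discount and the bootstrap value 0.5, standing in for the critic's prediction of the last state, are hypothetical and only for illustration):

import numpy as np

def discount(rewards, gamma=0.99, bootstrap=0.0):
    # same backward recursion as Agent.discount_rewards above
    out = np.zeros_like(rewards, dtype=float)
    running_add = bootstrap
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        out[t] = running_add
    return out

rewards = [1.0, 1.0, 1.0]  # CartPole pays +1 for every surviving step
print(discount(rewards))   # terminal episode: [2.9701, 1.99, 1.0]
print(discount(rewards, bootstrap=0.5))  # truncated episode, bootstrapped from the critic

The advantage fed to the actor is then discounted_rewards - values, i.e. how much better the sampled return turned out than the critic's estimate for the same states.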
