  • Hands-on ML and TF Chapter 16 Reinforcement Learning

    Policy Gradients

    import gym
    import numpy as np
    import tensorflow as tf
    
    def reset_graph(seed=42):
        # helper used in the book's notebooks: clear the default graph
        # and fix the random seeds so that runs are reproducible
        tf.reset_default_graph()
        tf.set_random_seed(seed)
        np.random.seed(seed)
    
    reset_graph()
    
    n_inputs = 4
    n_hidden = 4
    n_outputs = 1
    
    learning_rate = 0.01
    
    initializer = tf.variance_scaling_initializer()
    
    X = tf.placeholder(tf.float32, shape=[None, n_inputs])
    
    hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
    logits = tf.layers.dense(hidden, n_outputs)
    outputs = tf.nn.sigmoid(logits)  # probability of action 0 (left)
    p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
    action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)
    
    y = 1. - tf.to_float(action)
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(cross_entropy)
    gradients = [grad for grad, variable in grads_and_vars]
    gradient_placeholders = []
    grads_and_vars_feed = []
    for grad, variable in grads_and_vars:
        gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
        gradient_placeholders.append(gradient_placeholder)
        grads_and_vars_feed.append((gradient_placeholder, variable))
    training_op = optimizer.apply_gradients(grads_and_vars_feed)
    
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
    def discount_rewards(rewards, discount_rate):
        discounted_rewards = np.zeros(len(rewards))
        cumulative_rewards = 0
        for step in reversed(range(len(rewards))):
            cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
            discounted_rewards[step] = cumulative_rewards
        return discounted_rewards
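    # A quick worked example with discount_rate=0.8:
    #   discount_rewards([10, 0, -50], 0.8) -> array([-22., -40., -50.])
    #   (-22 = 10 + 0.8 * 0 + 0.8**2 * (-50))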
    
    def discount_and_normalize_rewards(all_rewards, discount_rate):
        all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
        flat_rewards = np.concatenate(all_discounted_rewards)
        reward_mean = flat_rewards.mean()
        reward_std = flat_rewards.std()
        return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]
    
    env = gym.make("CartPole-v0")
    
    n_games_per_update = 10
    n_max_steps = 1000
    n_iterations = 250
    save_iterations = 10
    discount_rate = 0.95
    
    with tf.Session() as sess:
        init.run()
        for iteration in range(n_iterations):
            print("
    Iteration: {}".format(iteration), end="")
            all_rewards = []
            all_gradients = []
            for game in range(n_games_per_update):
                current_rewards = []
                current_gradients = []
                obs = env.reset()
                for step in range(n_max_steps):
                    action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
                    obs, reward, done, info = env.step(action_val[0][0])
                    current_rewards.append(reward)
                    current_gradients.append(gradients_val)
                    if done:
                        break
                all_rewards.append(current_rewards)
                all_gradients.append(current_gradients)
    
            all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
            feed_dict = {}
            for var_index, gradient_placeholder in enumerate(gradient_placeholders):
                mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                          for game_index, rewards in enumerate(all_rewards)
                                              for step, reward in enumerate(rewards)], axis=0)
                ## mean_gradients has the same shape as the corresponding variable: it is the
                ## reward-weighted average of that variable's gradients over every step of the 10 episodes.
                feed_dict[gradient_placeholder] = mean_gradients
            sess.run(training_op, feed_dict=feed_dict)
            if iteration % save_iterations == 0:
                saver.save(sess, "./my_policy_net_pg.ckpt")
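
    After training, a minimal sketch (not from the book) for running the learned policy: restore the checkpoint saved above and keep feeding the sampled actions back into the environment.

    with tf.Session() as sess:
        saver.restore(sess, "./my_policy_net_pg.ckpt")
        obs = env.reset()
        for step in range(n_max_steps):
            # sample an action from the learned policy and step the environment
            action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
            obs, reward, done, info = env.step(action_val[0][0])
            if done:
                break
        env.close()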
    

    Markov Decision Process
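
    The chapter treats MDPs with Q-Value Iteration. Below is a minimal NumPy sketch on a toy 3-state MDP; the transition probabilities T[s, a, s'] and rewards R[s, a, s'] are illustrative values rather than the book's exact example.

    import numpy as np

    nan = np.nan  # marks actions that are not available in a state
    T = np.array([  # T[s, a, s']: transition probabilities
        [[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]],
        [[0.0, 1.0, 0.0], [nan, nan, nan], [0.0, 0.0, 1.0]],
        [[nan, nan, nan], [0.8, 0.1, 0.1], [nan, nan, nan]],
    ])
    R = np.array([  # R[s, a, s']: rewards
        [[10., 0., 0.], [0., 0., 0.], [0., 0., 0.]],
        [[0., 0., 0.], [nan, nan, nan], [0., 0., -50.]],
        [[nan, nan, nan], [40., 0., 0.], [nan, nan, nan]],
    ])
    possible_actions = [[0, 1, 2], [0, 2], [1]]

    Q = np.full((3, 3), -np.inf)  # -inf for impossible actions
    for s, actions in enumerate(possible_actions):
        Q[s, actions] = 0.0  # start at 0 for all possible actions

    gamma = 0.95  # discount factor
    for iteration in range(100):
        Q_prev = Q.copy()
        for s in range(3):
            for a in possible_actions[s]:
                Q[s, a] = sum(T[s, a, sp] * (R[s, a, sp] + gamma * np.max(Q_prev[sp]))
                              for sp in range(3))

    print(np.argmax(Q, axis=1))  # the greedy (optimal) action in each state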

    Learning to Play Ms. Pac-Man Using Deep Q-Learning

    This part first deals with installing the gym[atari] module on Windows.

    After looking through quite a few resources, I found that most solutions use the following command:

    pip install --no-index -f https://github.com/Kojoley/atari-py/releases atari_py
    

    This method comes from OpenAI Gym Atari on Windows.

    The same approach is also mentioned in this gym GitHub issue.

    The post How to Install OpenAI Gym in a Windows Environment also uses this method to install the Atari module.

    Alternatively, you can download a whl file from the following URL and install it offline:

    https://github.com/Kojoley/atari-py/releases

    To install a whl file, you first need the wheel package:

    pip install wheel
    

    Since Anaconda ships with the wheel package by default, the step above can be skipped.

    pip install atari_py-0.1.7-cp36-cp36m-win_amd64.whl
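
    After installing, a quick sanity check (a small sketch, not part of the original install notes) is to create the Ms. Pac-Man environment that the rest of this post uses:

    import gym

    env = gym.make("MsPacman-v0")
    obs = env.reset()
    print(obs.shape)         # (210, 160, 3): one raw RGB frame
    print(env.action_space)  # Discrete(9)
    env.close()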
    

    Animating the agent's interaction with the environment

    # Import the required packages
    import matplotlib
    import matplotlib.animation as animation
    import matplotlib.pyplot as plt
    plt.rcParams['axes.labelsize'] = 14
    plt.rcParams['xtick.labelsize'] = 12
    plt.rcParams['ytick.labelsize'] = 12
    
    # Store each rendered frame in frames
    frames = []
    
    n_max_steps = 1000
    n_change_steps = 10
    
    obs = env.reset()
    for step in range(n_max_steps):
        img = env.render(mode="rgb_array")
        frames.append(img)
        if step % n_change_steps == 0:
            action = env.action_space.sample() # play randomly
        obs, reward, done, info = env.step(action)
        if done:
            break
    
    # Define the functions that draw the animation
    def update_scene(num, frames, patch):
        patch.set_data(frames[num])
        return patch,
    
    def plot_animation(frames, repeat=False, interval=40):
        plt.close()  # or else nbagg sometimes plots in the previous cell
        fig = plt.figure()
        patch = plt.imshow(frames[0])
        plt.axis('off')
        return animation.FuncAnimation(fig, update_scene, fargs=(frames, patch), frames=len(frames), repeat=repeat, interval=interval)
    
    # Call the function to display the animation
    video = plot_animation(frames)
    plt.show()
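
    To keep the animation instead of only displaying it, the FuncAnimation object can also be written to disk. A small sketch, assuming Pillow is installed so that matplotlib's "pillow" writer is available:

    # interval=40 ms per frame corresponds to 25 frames per second
    video.save("agent_animation.gif", writer="pillow", fps=25)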
    

    The Deep Q-Learning algorithm

    First, preprocess obs: crop and downsample it to an 88x80 grayscale image and improve the contrast. (Taking rows 1-175 with a stride of 2 gives 88 rows, and taking every other column of the 160-pixel-wide frame gives 80 columns.)

    import gym
    import matplotlib.pyplot as plt
    import numpy as np
    
    env = gym.make('MsPacman-v0')
    obs = env.reset()
    plt.imshow(obs)
    plt.show()
    
    mspacman_color = np.array([210, 164, 74]).mean()
    
    def preprocess_observation(obs):
        img = obs[1:176:2,::2] # crop and downsize
        img = img.mean(axis=2) # to greyscale
        img[img == mspacman_color] = 0 # improve contrast
        img = (img - 128) / 128 # normalize from -1. to 1.
        return img.reshape(88,80,1)
    
    pre_obs = preprocess_observation(obs).reshape(88,80)
    plt.imshow(pre_obs, cmap ='gray')
    plt.show()
    

    DQN算法

    # ------------------------------ Build the DQN computation graph -----------------------------------------
    import os  # used below to check whether a checkpoint already exists
    import numpy as np
    import tensorflow as tf
    
    input_height = 88
    input_width = 80
    input_channels = 1
    conv_n_maps = [32, 64, 64]
    conv_kernel_sizes = [(8,8), (4,4), (3,3)]
    conv_strides = [4, 2, 1]
    conv_paddings = ["SAME"] * 3 
    conv_activation = [tf.nn.relu] * 3
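    # With "SAME" padding each conv layer divides the spatial size by its stride:
    # 88 -> 22 -> 11 and 80 -> 20 -> 10, hence 64 maps of 11x10 below.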
    n_hidden_in = 64 * 11 * 10  # conv3 has 64 maps of 11x10 each
    n_hidden = 512
    hidden_activation = tf.nn.relu
    n_outputs = env.action_space.n  # 9 discrete actions are available
    initializer = tf.variance_scaling_initializer()
    
    def q_network(X_state, name):
        prev_layer = X_state  # preprocess_observation already scales inputs to the [-1.0, 1.0] range
        with tf.variable_scope(name) as scope:
            for n_maps, kernel_size, strides, padding, activation in zip(
                    conv_n_maps, conv_kernel_sizes, conv_strides,
                    conv_paddings, conv_activation):
                prev_layer = tf.layers.conv2d(
                    prev_layer, filters=n_maps, kernel_size=kernel_size,
                    strides=strides, padding=padding, activation=activation,
                    kernel_initializer=initializer)
            last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_in])
            hidden = tf.layers.dense(last_conv_layer_flat, n_hidden,
                                     activation=hidden_activation,
                                     kernel_initializer=initializer)
            outputs = tf.layers.dense(hidden, n_outputs,
                                      kernel_initializer=initializer)
        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope=scope.name)
        trainable_vars_by_name = {var.name[len(scope.name):]: var
                                  for var in trainable_vars}
        return outputs, trainable_vars_by_name
    
    # -------------------------------- Copying parameters from the online DQN to the target DQN ----------------------------------
    X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width,
                                                input_channels])
    online_q_values, online_vars = q_network(X_state, name="q_networks/online")
    target_q_values, target_vars = q_network(X_state, name="q_networks/target")
    
    copy_ops = [target_var.assign(online_vars[var_name])
                for var_name, target_var in target_vars.items()]
    copy_online_to_target = tf.group(*copy_ops)
    
    # ------------------------------ Build the loss function and optimizer for the actor (online) DQN
    learning_rate = 0.001
    momentum = 0.95
    
    with tf.variable_scope("train"):
        X_action = tf.placeholder(tf.int32, shape=[None])
        y = tf.placeholder(tf.float32, shape=[None, 1])
        q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs),
                                axis=1, keepdims=True)
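        # Huber-style loss: quadratic for errors up to 1.0 and linear beyond that,
        # so large TD errors do not blow up the gradients.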
        error = tf.abs(y - q_value)
        clipped_error = tf.clip_by_value(error, 0.0, 1.0)
        linear_error = 2 * (error - clipped_error)
        loss = tf.reduce_mean(tf.square(clipped_error) + linear_error)
    
        global_step = tf.Variable(0, trainable=False, name='global_step')
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
        training_op = optimizer.minimize(loss, global_step=global_step)
    
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
    #---------------------------- Replay memory (replaces the deque used in the book) -----------------------------------
    class ReplayMemory:
        def __init__(self, maxlen):
            self.maxlen = maxlen
            self.buf = np.empty(shape=maxlen, dtype=np.object)
            self.index = 0
            self.length = 0
            
        def append(self, data):
            self.buf[self.index] = data
            self.length = min(self.length + 1, self.maxlen)
            self.index = (self.index + 1) % self.maxlen
        
        def sample(self, batch_size, with_replacement=True):
            if with_replacement:
                indices = np.random.randint(self.length, size=batch_size) # faster
            else:
                indices = np.random.permutation(self.length)[:batch_size]
            return self.buf[indices]
        
    replay_memory_size = 500000
    replay_memory = ReplayMemory(replay_memory_size)
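    # Tiny illustration of the buffer semantics (illustrative values, not real transitions):
    # appending more than maxlen items overwrites the oldest ones, and sample() draws
    # uniformly, with replacement by default.
    #
    #   demo = ReplayMemory(maxlen=3)
    #   for i in range(5):
    #       demo.append((i, i, 1.0, i + 1, 1.0))  # (state, action, reward, next_state, continue)
    #   demo.sample(2)                            # 2 transitions drawn from the 3 that remain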
    
    #---------------------------------- Sampling from the replay memory --------------------------------------
    def sample_memories(batch_size):
        cols = [[], [], [], [], []] # state, action, reward, next_state, continue
        for memory in replay_memory.sample(batch_size):
            for col, value in zip(cols, memory):
                col.append(value)
        cols = [np.array(col) for col in cols]
        return cols[0], cols[1], cols[2].reshape(-1, 1), cols[3], cols[4].reshape(-1, 1)
    
    #------------------------------- The epsilon-greedy policy -----------------------------------------
    eps_min = 0.1
    eps_max = 1.0
    eps_decay_steps = 2000000
    
    def epsilon_greedy(q_values, step):
        epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
        if np.random.rand() < epsilon:
            return np.random.randint(n_outputs) # random action
        else:
            return np.argmax(q_values) # optimal action
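    # With these settings, epsilon decays linearly from 1.0 at step 0 to 0.1 at step
    # 2,000,000 and stays at 0.1 afterwards; e.g. at step 1,000,000, epsilon = 1.0 - 0.9 * 0.5 = 0.55.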
    
    #------------------------------- The DQN training loop --------------------------------------------
    n_steps = 4000000  # total number of training steps
    training_start = 10000  # start training after 10,000 game iterations
    training_interval = 4  # run a training step every 4 game iterations
    save_steps = 1000  # save the model every 1,000 training steps
    copy_steps = 10000  # copy online DQN to target DQN every 10,000 training steps
    discount_rate = 0.99
    skip_start = 90  # Skip the start of every game (it's just waiting time).
    batch_size = 50
    iteration = 0  # game iterations
    checkpoint_path = "./my_dqn.ckpt"
    done = True # env needs to be reset
    
    loss_val = np.infty
    game_length = 0
    total_max_q = 0
    mean_max_q = 0.0
    
    with tf.Session() as sess:
        if os.path.isfile(checkpoint_path + ".index"):
            saver.restore(sess, checkpoint_path)
        else:
            init.run()
            copy_online_to_target.run()
        while True:
            step = global_step.eval()
            if step >= n_steps:
                break
            iteration += 1
            print("
    Iteration {}	Training step {}/{} ({:.1f})%	Loss {:5f}	Mean Max-Q {:5f}   ".format(
                iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q), end="")
            if done: # game over, start again
                obs = env.reset()
                for skip in range(skip_start): # skip the start of each game
                    obs, reward, done, info = env.step(0)
                state = preprocess_observation(obs)
    
            # Online DQN evaluates what to do
            q_values = online_q_values.eval(feed_dict={X_state: [state]})
            action = epsilon_greedy(q_values, step)
    
            # Online DQN plays
            obs, reward, done, info = env.step(action)
            next_state = preprocess_observation(obs)
    
            # Let's memorize what happened
            replay_memory.append((state, action, reward, next_state, 1.0 - done))
            state = next_state
    
            # Compute statistics for tracking progress (not shown in the book)
            total_max_q += q_values.max()
            game_length += 1
            if done:
                mean_max_q = total_max_q / game_length
                total_max_q = 0.0
                game_length = 0
    
            if iteration < training_start or iteration % training_interval != 0:
                continue # only train after warmup period and at regular intervals
            
            # Sample memories and use the target DQN to produce the target Q-Value
            X_state_val, X_action_val, rewards, X_next_state_val, continues = (
                sample_memories(batch_size))
            next_q_values = target_q_values.eval(
                feed_dict={X_state: X_next_state_val})
            max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
            y_val = rewards + continues * discount_rate * max_next_q_values
    
            # Train the online DQN
            _, loss_val = sess.run([training_op, loss], feed_dict={
                X_state: X_state_val, X_action: X_action_val, y: y_val})
    
            # Regularly copy the online DQN to the target DQN
            if step % copy_steps == 0:
                copy_online_to_target.run()
    
            # And save regularly
            if step % save_steps == 0:
                saver.save(sess, checkpoint_path)
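
    After (or during) training, here is a minimal sketch for watching the trained agent play greedily. It assumes the graph built above, preprocess_observation, the plot_animation helper from earlier, and the checkpoint saved at ./my_dqn.ckpt:

    frames = []
    with tf.Session() as sess:
        saver.restore(sess, checkpoint_path)
        obs = env.reset()
        for step in range(10000):
            state = preprocess_observation(obs)
            # the online DQN picks the action with the highest estimated Q-Value
            q_values = online_q_values.eval(feed_dict={X_state: [state]})
            action = np.argmax(q_values)
            obs, reward, done, info = env.step(action)
            frames.append(env.render(mode="rgb_array"))
            if done:
                break
    video = plot_animation(frames)
    plt.show()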
    
  • Original article: https://www.cnblogs.com/ZeroTensor/p/10925951.html