  • Reinforcement Learning 6: A Hands-on Comparison of MC and TD

    # encoding:utf-8
    import numpy as np
    import matplotlib.pylab as plt
    
    '''
    Random walk problem
    0 - 1 - 2 - 3 - 4 - 5 - 6
    e           s           e
    Reaching terminal 0 gives reward 0; reaching terminal 6 gives reward 1.
    Every intermediate transition gives reward 0.

    Policy: actions [-1, 1], each chosen with probability 0.5 (-1 = left, 1 = right).
    Under this policy, in theory, the larger the state index, the higher the expected return.
    '''
    
    stats = range(7)
    start = 3
    end = [0, 6]
    actions = [-1, 1]
    
    r = 1   # discount factor (gamma = 1; not used explicitly below)
    alpha = 0.5 # learning rate (step size)
    echos = [5, 10, 50, 100, 500, 1000, 10000]  # successive batches of updates; v is not reset between batches
    
    def choose_act(stat):
        # policy: move right or left with equal probability
        if np.random.rand() > 0.5:
            return 1
        else:
            return -1
    
    v = np.zeros([len(stats)])
    
    for i in echos:
        for j in range(i):
            act = choose_act(start)
            stat_ = start + act     # next state

            if stat_ in end:
                if stat_ == 6:
                    # transition into the right terminal: reward 1 (terminal value v[6] stays 0)
                    v[start] += alpha * (1 + v[stat_] - v[start])
                else:
                    # transition into the left terminal: reward 0
                    v[start] += alpha * (v[stat_] - v[start])
                start = np.random.randint(1,6)
            else:
                # ordinary transition: reward 0, standard TD(0) update
                v[start] += alpha * (v[stat_] - v[start])
                start = np.random.randint(1,6)  # single-step variant: jump to a random non-terminal state

        plt.plot(v[1:-1])               # values of the non-terminal states 1..5
        plt.text(stats[-4], v[-3], j+1)     # label the curve with the size of this batch of updates
    
    plt.xlabel('state')
    plt.ylabel('v')
    plt.text(1, 0.8, 'alpha = %s'%alpha)
    plt.show()

    You can see that a larger learning rate moves the estimates toward the true values faster, but at a learning rate of 0.5 they clearly "overfit": the step size is so large that the values keep chasing individual noisy transitions instead of settling down.
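    For reference, the true values being approached here are V(s) = s/6 for s = 1..5: with all intermediate rewards equal to 0 and gamma = 1, V(s) is just the probability of eventually exiting at the right terminal. A quick check (a sketch of mine, not part of the original code) that solves the Bellman equations V(s) = 0.5*V(s-1) + 0.5*V(s+1), with the +1 reward entering from state 5, directly with numpy:

    import numpy as np

    # linear system for the five non-terminal states; terminal values are 0
    A = np.zeros((5, 5))
    b = np.zeros(5)
    for s in range(1, 6):
        A[s - 1, s - 1] = 1.0
        if s - 1 >= 1:
            A[s - 1, s - 2] = -0.5   # 0.5 * V(s-1)
        if s + 1 <= 5:
            A[s - 1, s] = -0.5       # 0.5 * V(s+1)
        if s + 1 == 6:
            b[s - 1] = 0.5           # reward 1 reached with probability 0.5 from state 5
    print(np.linalg.solve(A, b))     # approximately [1/6, 1/3, 1/2, 2/3, 5/6]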

    This version updates after every single step (and then restarts from a random state), whereas the book works with complete episodes, so the results differ; I'll update the code when I have time.
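    As a rough sketch of the episode-based alternative (this is not the book's code), a TD(0) loop could start every episode at state 3, follow the policy until it reaches a terminal state, and apply the update after each transition along the way; it reuses stats, end, alpha and choose_act from the snippet above.

    # episodic TD(0) sketch: one complete episode per iteration, updates along the trajectory
    v_td = np.zeros(len(stats))
    for episode in range(100):
        s = 3                                   # every episode starts in the middle
        while s not in end:
            s_next = s + choose_act(s)
            reward = 1 if s_next == 6 else 0    # +1 only on entering the right terminal
            # TD(0) update with gamma = 1; terminal values stay 0
            v_td[s] += alpha * (reward + v_td[s_next] - v_td[s])
            s = s_next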

    # encoding:utf-8
    from __future__ import division
    import numpy as np
    import matplotlib.pylab as plt
    
    stats = range(7)
    end = [0, 6]
    actions = [-1, 1]
    r = 1   # discount factor (gamma = 1; note that r is reassigned as the reward inside the MC loop below)
    
    def choose_act(stat):
        # policy: move right or left with equal probability
        if np.random.rand() > 0.5:
            return 1
        else:
            return -1
    
    v_t = [0, 1/6, 1/3, 1/2, 2/3, 5/6, 0]  # true state values under this policy: V(s) = s/6
    alpha_td = [0.1, 0.15, 0.2] # TD learning rates
    alpha_mc = [0.01, 0.02, 0.04]   # MC learning rates
    for c in range(3):
        # TD
        alpha = alpha_td[c]
        # v = np.random.rand(len(stats))        
        # v = np.zeros(len(stats))
        v = [0.2] * len(stats)
        errors = []
        start = 3
    
        for j in range(100):
            act = choose_act(start)
            stat_ = start + act
    
            if stat_ in end:
                if stat_ == 6:
                    # transition into the right terminal: reward 1
                    v[start] += alpha * (1 + v[stat_] - v[start])
                else:
                    # transition into the left terminal: reward 0
                    v[start] += alpha * (v[stat_] - v[start])
                start = np.random.randint(1,6)  # start a new episode from a random non-terminal state
            else:
                v[start] += alpha * (v[stat_] - v[start])
                start = stat_   # np.random.randint(1,6)

            # root of the summed squared errors against the true values, recorded after every step
            error = np.sqrt(sum([pow(value - v_t[index], 2) for index, value in enumerate(v)]))
            errors.append(error)
    
        plt.plot(range(100), errors)
        index = np.random.randint(40,100)
        plt.text(index-3, errors[index], 'alpha_td = %s'%alpha)
    
        # MC
        alpha = alpha_mc[c]
        # v_mc = np.random.rand(len(stats))
        # v_mc = np.zeros(len(stats))
        v_mc = [0.2] * len(stats)
        count_mc = np.zeros(len(stats))
        errors = []
        for j in range(100):
            process = []
            start = 3   # np.random.randint(1, 6)
            while True:
                if start in end:
                    process.append([start])
                    break
                act = choose_act(start)
                if start == 5 and act == 1:
                    r = 1   # only the transition from state 5 into the right terminal is rewarded
                else:
                    r = 0
                process.append([start, act, r])
                start = start + act
    
            T = len(process[:-1])
            s_all = [i[0] for i in process[:-1]]
            s_dealed = []
            for k in range(T):
                sar = process[k]
                s = sar[0]
                if s in s_dealed:continue
    
                # first visit
                t = s_all.index(s)     # index of the first occurrence of s in this episode
                num = s_all.count(s)   # number of times s occurs in this episode
                # return from the first visit onward, divided by the number of visits
                # (note: standard first-visit MC would not divide by num)
                r_all = sum([i[2] for i in process[t:-1]]) / num
                v_mc[s] += alpha * (r_all - v_mc[s])
                # v_mc[s] = (v_mc[s] * count_mc[s] + r_all) / (count_mc[s] + 1)
                # count_mc[s] += 1
    
                s_dealed.append(s)
            # same error measure as for TD, but recorded once per episode
            error = np.sqrt(sum([pow(value - v_t[index], 2) for index, value in enumerate(v_mc)]))
            errors.append(error)
        plt.plot(range(100), errors, '.')
        index = np.random.randint(40,100)
        plt.text(index-3, errors[index], 'alpha_mc = %s'%alpha)
    
    plt.xlabel('echo')
    plt.ylabel('mse')
    plt.show()

    The random walk has a peculiarity: there are two terminal states, and one of them gives reward 0. In the first few episodes, if single-step TD happens to move left at the start, it takes many more updates before any information from the right terminal (the only positive reward) propagates back; MC, on the other hand, processes whole episodes, each of which ends at either the left or the right terminal, so the chance of hitting the rewarded right terminal early on is much higher. That is why MC appears to converge noticeably faster than TD over the first few episodes.
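    A quick way to check this intuition is to simulate episodes of the walk on their own and look at where they terminate and how long they take. The small sketch below (reusing end and choose_act from the code above) counts the fraction of episodes that end at the right terminal and the average episode length; starting from state 3 these come out near 1/2 and 9 steps.

    # sketch: how often does an episode reach the right terminal, and how long does it take?
    n_episodes = 10000
    right_hits, total_steps = 0, 0
    for _ in range(n_episodes):
        s, steps = 3, 0
        while s not in end:
            s += choose_act(s)
            steps += 1
        right_hits += (s == 6)
        total_steps += steps
    print('ended at the right terminal: %s' % (right_hits / float(n_episodes)))   # close to 0.5
    print('average episode length: %s' % (total_steps / float(n_episodes)))       # close to 9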

    Overall, however, TD converges faster than MC and settles at a smaller error, so TD is the more efficient of the two.

    Problems with the code above:

    1. TD computes the MSE after every step, while MC computes it after every episode, so the two curves are not compared on the same footing.

    2. The MSE comes from a single evaluation run rather than being averaged over many independent runs (see the sketch below).

    I'll update the code later.
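    In the meantime, here is a rough sketch (for the TD side only, and not the updated code itself) of how points 1 and 2 could be addressed: run TD(0) episodically, record the RMS error once per completed episode, and average the error curves over many independent runs. It reuses stats, end, choose_act and v_t from the snippet above.

    def td_run(alpha, n_episodes=100):
        # one independent TD(0) run; returns the per-episode RMS error over states 1..5
        v = [0.2] * len(stats)
        errs = []
        for _ in range(n_episodes):
            s = 3
            while s not in end:
                s_next = s + choose_act(s)
                reward = 1 if s_next == 6 else 0    # +1 only on entering the right terminal
                v[s] += alpha * (reward + v[s_next] - v[s])
                s = s_next
            # error recorded once per completed episode (point 1)
            errs.append(np.sqrt(np.mean([(v[k] - v_t[k]) ** 2 for k in range(1, 6)])))
        return np.array(errs)

    # average the error curve over many independent runs (point 2)
    n_runs = 100
    avg_err = sum(td_run(0.1) for _ in range(n_runs)) / n_runs
    plt.plot(avg_err)
    plt.xlabel('episode')
    plt.ylabel('average RMS error')
    plt.show()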

  • Original post: https://www.cnblogs.com/yanshw/p/10396798.html