e.g.: A Painless Q-learning Tutorial (一个 Q-learning 算法的简明教程)
第一种:
import numpy as np
import random

# Q-learning on the classic 6-room example (states 0-5, goal state 5).
# r[s, a] is the immediate reward for moving from state s to state a:
#   -1  -> no door between the rooms (invalid action)
#    0  -> valid move, no reward
#   100 -> move that reaches the goal
r = np.array([
    [-1, -1, -1, -1,  0,  -1],
    [-1, -1, -1,  0, -1, 100],
    [-1, -1, -1,  0, -1,  -1],
    [-1,  0,  0, -1,  0,  -1],
    [ 0, -1, -1,  0, -1, 100],
    [-1,  0, -1, -1,  0, 100],
])
gamma = 0.8  # discount factor (折扣因子)

# Q-value table, learned in place.  Plain ndarray instead of the
# deprecated np.matrix; indexing and row .max() behave the same here.
q = np.zeros((6, 6))
# Snapshot of q, used to print progress only when the table changed.
e = np.zeros((6, 6))

for episode in range(100):
    # One episode: start in a random state, walk until the goal (5).
    state = random.randint(0, 5)
    while state != 5:
        # Valid actions are the ones with a non-negative reward entry.
        valid_actions = [a for a in range(6) if r[state, a] >= 0]
        # random.choice replaces the manual randint-into-list indexing.
        next_state = random.choice(valid_actions)
        # Q-learning update (learning rate 1, greedy bootstrap).
        q[state, next_state] = r[state, next_state] + gamma * q[next_state].max()
        # Print only when the update actually changed the table.
        if (e != q).any():
            e = q.copy()
            print("%s----%s" % (state, next_state))
            print(q)
            print(" ")
        state = next_state
得到策略矩阵Q:
[[ 0. 0. 0. 0. 80. 0. ] [ 0. 0. 0. 64. 0. 100. ] [ 0. 0. 0. 64. 0. 0. ] [ 0. 80. 51.2 0. 80. 0. ] [ 64. 0. 0. 64. 0. 100. ] [ 0. 0. 0. 0. 0. 0. ]]
第二种:
#设定初始 q 和 reward q = np.zeros((6,6)) rewards = np.zeros((6,6)) ; rewards[:,5]=500 # 可行的动作 actions = [[4],[3,5],[3],[1,2,4],[0,3,5],[1,4,5]] def trial(): #初始化状态 s = random.randint(0,6) while s<5: #choice方法返回一个列表,元组或字符串的随机项 s1 = a = random.choice(actions[s]) q[s,a]=rewards[s,a]+0.8*q[s1].max() s = s1 for i in range(200): trial() print (q) def test(s): print (s) while s<5: #返回的是最大数的索引 s = q[s].argmax() print ("->%s"% s) test(2)
得到策略矩阵Q及测试状态2:
[[ 0. 0. 0. 0. 400. 0.] [ 0. 0. 0. 320. 0. 500.] [ 0. 0. 0. 320. 0. 0.] [ 0. 400. 256. 0. 400. 0.] [ 320. 0. 0. 320. 0. 500.] [ 0. 0. 0. 0. 0. 0.]] 2 ->3 ->1 ->5