# Python 3.6.5 :: Anaconda, Inc.
"""Tabular Q-learning demo on a one-dimensional corridor.

The agent starts at the left end of a row of ``N_STATUS`` cells and has to
reach the treasure ``T`` in the right-most cell. Moving right from the cell
next to the treasure ends the episode with reward 1; every other move
yields reward 0.
"""
import time

import numpy as np
import pandas as pd

np.random.seed(2)  # reproducible runs

N_STATUS = 5                  # number of cells in the corridor (last one is the goal)
ACTIONS = ['left', 'right']   # available actions; also the Q-table column labels
EPSILON = 0.9                 # probability of taking the greedy action
ALPHA = 0.1                   # learning rate
LAMBDA = 0.9                  # discount factor applied to the next state's best value
MAX_EPISODES = 13             # number of training episodes
FRESH_TIME = 0.1              # pause between rendered frames, in seconds


def build_q_table(n_status, actions):
    """Return a zero-initialised Q-table.

    Args:
        n_status: Number of states (rows).
        actions: Sequence of action labels (columns).

    Returns:
        A ``pandas.DataFrame`` of shape ``(n_status, len(actions))`` filled
        with zeros, indexed by state number with one column per action.
    """
    return pd.DataFrame(
        np.zeros((n_status, len(actions))),
        columns=actions,
    )


def choose_action(state, q_table):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    A random action is taken with probability ``1 - EPSILON``, or whenever
    the state has not been learned yet (all of its action values are still
    zero). Otherwise the action with the highest value is returned.

    Args:
        state: Integer row position of the current state in ``q_table``.
        q_table: Q-table as produced by ``build_q_table``.

    Returns:
        The chosen action label (one of ``ACTIONS``).
    """
    state_actions = q_table.iloc[state, :]
    # ``(state_actions == 0).all()`` is "every value is zero". The former
    # ``state_actions.all() == 0`` was true as soon as *any* value was zero,
    # which kept the agent exploring even after it had learned something.
    unexplored = (state_actions == 0).all()
    if np.random.uniform() > EPSILON or unexplored:
        return np.random.choice(ACTIONS)
    # idxmax() returns the column label; Series.argmax() returns an integer
    # position on current pandas, which would never match an action name.
    return state_actions.idxmax()


def get_env_feedback(S, A):
    """Apply action ``A`` in state ``S`` and return the environment response.

    Args:
        S: Current state (integer cell index).
        A: Action label; ``'right'`` moves right, anything else moves left.

    Returns:
        A tuple ``(S_, R)`` with the next state (an integer cell index, or
        the string ``'terminal'`` once the goal is reached) and the reward
        (1 on reaching the goal, 0 otherwise).
    """
    if A == 'right':
        if S == N_STATUS - 2:
            # Stepping right from the cell next to the goal ends the episode.
            return 'terminal', 1
        return S + 1, 0
    # Moving left: the left wall keeps the agent in cell 0.
    return (S if S == 0 else S - 1), 0


def update_env(S, episode, step_counter):
    """Render the current state of the corridor to stdout.

    Args:
        S: Current state (integer cell index, or ``'terminal'``).
        episode: Zero-based episode number (printed one-based).
        step_counter: Number of steps taken so far in this episode.
    """
    # NOTE(review): the single leading space in the strings printed below
    # presumably stood for a carriage return ('\r') so that each frame
    # overwrites the previous one -- confirm against the original script.
    if S == 'terminal':
        interaction = 'Episode %d: total_steps = %s' % (episode+1, step_counter)
        print(' {}'.format(interaction), end='')
        time.sleep(1)
        print(' ', end='')
        return

    env_list = ['-']*(N_STATUS-1)+['T']
    env_list[S] = 'o'  # mark the agent's position
    interaction = ''.join(env_list)
    print(' {}'.format(interaction), end='')
    time.sleep(FRESH_TIME)


def rl():
    """Run the Q-learning training loop.

    Trains for ``MAX_EPISODES`` episodes, rendering every step, and updates
    the table with
    ``Q(S, A) += ALPHA * (R + LAMBDA * max_a Q(S_, a) - Q(S, A))``.

    Returns:
        The learned Q-table as a ``pandas.DataFrame``.
    """
    q_table = build_q_table(N_STATUS, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            # .loc replaces the removed DataFrame.ix indexer; rows are
            # labelled 0..N_STATUS-1 and columns by action name.
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                q_target = R + LAMBDA * q_table.iloc[S_, :].max()
            else:
                # No future value beyond the terminal state.
                q_target = R
                is_terminated = True

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)
            S = S_

            step_counter += 1
            update_env(S, episode, step_counter)
    return q_table


if __name__ == "__main__":
    q_table = rl()
    print(' Q-table: ')
    print(q_table)