- # encoding:utf-8
- import numpy as np
- import matplotlib.pylab as plt
'''
Random-walk prediction problem (TD(0) demo).

States: 0 - 1 - 2 - 3 - 4 - 5 - 6
        e           s           e
Episodes start at state 3; states 0 and 6 are terminal.
Entering terminal 6 gives reward 1, entering terminal 0 gives reward 0,
and every intermediate transition gives reward 0.
Policy: move -1 (left) or +1 (right) with probability 0.5 each.
Under this policy, the true value grows with the state index.
'''
# All states 0..6 (0 and 6 are terminal).
stats = range(7)
start = 3          # state the first update is made from
end = [0, 6]       # terminal states
actions = [-1, 1]  # step left / step right
r = 1  # discount factor (gamma); NOTE(review): never actually applied in the updates below
alpha = 0.5  # learning rate (step size)
# Cumulative batches of updates after which the value estimates are plotted.
echos = [5, 10, 50, 100, 500, 1000, 10000]
def choose_act(stat):
    """Behaviour policy: step right (+1) or left (-1) with equal probability.

    The current state ``stat`` is accepted for interface compatibility but
    does not influence the decision.
    """
    return 1 if np.random.rand() > 0.5 else -1
# Tabular value estimates, one slot per state (terminals are never updated).
v = np.zeros([len(stats)])

for i in echos:
    # Perform `i` more single-step TD(0) updates, then plot the estimates.
    for j in range(i):
        act = choose_act(start)
        stat_ = start + act
        # Reward is 1 only on the transition into the right terminal (state 6).
        reward = 1 if stat_ == 6 else 0
        # TD(0) update: V(s) <- V(s) + alpha * (r + V(s') - V(s)).
        v[start] += alpha * (reward + v[stat_] - v[start])
        # Resample a fresh non-terminal state for the next update
        # (exploring starts every step, not a continuing trajectory).
        start = np.random.randint(1, 6)
    # Plot the non-terminal values and label the curve with the batch size.
    plt.plot(v[1:-1])
    plt.text(stats[-4], v[-3], j + 1)

plt.xlabel('state')
plt.ylabel('v')
plt.text(1, 0.8, 'alpha = %s'%alpha)
plt.show()
可以看到, 随着训练回合数的增大, 估计效果越来越好; 但当学习率取 0.5 时, 估计已经明显过拟合了
- # encoding:utf-8
- from __future__ import division
- __author__ = 'HP'
- import numpy as np
- import matplotlib.pylab as plt
# States 0..6; 0 and 6 are terminal.
stats = range(7)
end = [0, 6]       # terminal states
actions = [-1, 1]  # step left / step right
r = 1  # discount factor (gamma); NOTE(review): clobbered as the per-step reward in the MC loop below
def choose_act(stat):
    """Random policy: return +1 (right) or -1 (left), each with probability 0.5.

    ``stat`` is ignored; it is kept only so callers can pass the current state.
    """
    return 1 if np.random.rand() > 0.5 else -1
# Analytic state values under the random policy: v(s) = s/6 for s = 1..5,
# and 0 at both terminals.
v_t = [0, 1/6, 1/3, 1/2, 2/3, 5/6, 0]
alpha_td = [0.1, 0.15, 0.2]   # TD learning rates, one per comparison run
alpha_mc = [0.01, 0.02, 0.04]  # MC learning rates, one per comparison run
# Compare TD(0) against first-visit MC for three (alpha_td, alpha_mc) pairs,
# plotting the RMS error against the analytic values over 100 episodes each.
for c in range(3):
    # ---------------- TD(0) ----------------
    alpha = alpha_td[c]
    # Constant optimistic initialisation for the non-terminal states.
    # Terminal states must be worth exactly 0 — otherwise the bootstrap
    # target v[terminal] is biased and the error curve has a spurious floor.
    v = [0.2] * len(stats)
    v[0] = v[6] = 0.0
    errors = []
    start = 3
    for j in range(100):
        act = choose_act(start)
        stat_ = start + act
        if stat_ in end:
            # Reward 1 only for entering the right terminal (state 6).
            if stat_ == 6:
                v[start] += alpha * (1 + v[stat_] - v[start])
            else:
                v[start] += alpha * (v[stat_] - v[start])
            # Episode over: restart from a random non-terminal state.
            start = np.random.randint(1, 6)
        else:
            v[start] += alpha * (v[stat_] - v[start])
            start = stat_  # continue the same trajectory
        # Root of the summed squared error against the analytic values.
        error = np.sqrt(sum(pow(value - v_t[s], 2) for s, value in enumerate(v)))
        errors.append(error)
    plt.plot(range(100), errors)
    index = np.random.randint(40, 100)
    plt.text(index-3, errors[index], 'alpha_td = %s'%alpha)

    # ---------------- first-visit MC ----------------
    alpha = alpha_mc[c]
    v_mc = [0.2] * len(stats)
    v_mc[0] = v_mc[6] = 0.0
    errors = []
    for j in range(100):
        # Roll out one complete episode under the random policy.
        # `process` holds [state, action, reward] per step, plus a final
        # [terminal_state] sentinel.
        process = []
        start = 3
        while True:
            if start in end:
                process.append([start])
                break
            act = choose_act(start)
            # Reward 1 only on the 5 -> 6 transition.
            reward = 1 if (start == 5 and act == 1) else 0
            process.append([start, act, reward])
            start = start + act
        T = len(process) - 1  # number of (s, a, r) steps before the terminal sentinel
        s_all = [step[0] for step in process[:-1]]
        s_dealed = []
        for k in range(T):
            s = process[k][0]
            if s in s_dealed:
                continue
            # First-visit MC: the return is the (undiscounted) sum of rewards
            # from the FIRST occurrence of s to the end of the episode.
            # (The original divided this sum by the visit count, which is
            # neither first-visit nor every-visit MC — just a wrong estimate.)
            t = s_all.index(s)
            g = sum(step[2] for step in process[t:-1])
            v_mc[s] += alpha * (g - v_mc[s])
            s_dealed.append(s)
        error = np.sqrt(sum(pow(value - v_t[s], 2) for s, value in enumerate(v_mc)))
        errors.append(error)
    plt.plot(range(100), errors, '.')
    index = np.random.randint(40, 100)
    plt.text(index-3, errors[index], 'alpha_mc = %s'%alpha)

plt.xlabel('echo')
plt.ylabel('mse')
plt.show()
随机行走有个特殊性: 两个终点, 有一个终点奖励为 0, 也就是说在前几个回合中, 单步更新的 TD 如果一开始向左走, 需要好多步才能到达右边终点, 而 MC 由于是整个回合, 要么左, 要么右, 先到右边终点的概率要大得多, 所以, 前几步 MC 收敛明显比 TD 快
但是从总体来看, TD 收敛比 MC 要快, 而且收敛值要小, 故 TD 效率更高
来源: http://www.bubuko.com/infodetail-2959416.html