
Table of Contents

- I. Introduction
- II. The Double DQN Algorithm
  - (1) Background: fixing the Q-value overestimation of the original DQN
    - Target Q-value computation in the original DQN
  - (2) Core idea: decoupling "action selection" from "value evaluation"
    - Target Q-value computation in Double DQN
  - (3) Core comparison between Double DQN and the original DQN
    - 1. Original DQN implementation (code snippet)
    - 2. Double DQN implementation (code snippet)
  - (4) Complete execution flow of Double DQN
  - (5) Complete Python implementation of Double DQN
  - (6) Program results and analysis
    - Results
    - Results analysis
  - (7) Advantages and applicable scenarios of Double DQN
    - Applicable scenarios
  - (8) Key takeaways
- III. The Dueling DQN Algorithm
  - (1) Background: optimizing the structure used to learn Q values
  - (2) Core idea: splitting the Q value into V(s) and A(s,a)
    - 1. Core formula: decomposing and reconstructing the Q value
    - 2. Key refinement: resolving the "identifiability" problem
  - (3) Network structure of Dueling DQN
    - 1. Qnet of the original DQN (single hidden layer)
    - 2. VAnet of Dueling DQN (separate V and A streams)
  - (4) Training logic of Dueling DQN (differences from the original DQN)
    - 1. Network choice at initialization
    - 2. Training / update logic
  - (5) Complete Python implementation of Dueling DQN
  - (6) Program results and analysis
    - Results
    - Results analysis
  - (7) Advantages and applicable scenarios of Dueling DQN
    - Core advantages
    - Applicable scenarios
  - (8) Dueling DQN vs. Double DQN: key differences
  - (9) Key takeaways
- IV. Summary
I. Introduction

The DQN algorithm opened the door to deep reinforcement learning, but as a pioneering piece of work it has some weaknesses and leaves room for improvement. Since DQN was published, a large number of improved algorithms have therefore appeared. This article introduces two of the best-known ones: Double DQN and Dueling DQN.
II. The Double DQN Algorithm

(1) Background: fixing the Q-value overestimation of the original DQN

When the original DQN computes its target Q value, there is a key problem: the same target network (target_q_net) is used both to select the action with the largest Q value and to evaluate that action.

Target Q-value computation in the original DQN

$$
y = r + \gamma \max_{a'} Q_{\theta^-}(s', a')
$$

The max operator here does two jobs at once: it picks the action with the largest estimated value and uses that same estimate as the evaluation, i.e. $\max_{a'} Q_{\theta^-}(s', a') = Q_{\theta^-}\big(s', \arg\max_{a'} Q_{\theta^-}(s', a')\big)$, where $\theta^-$ denotes the parameters of target_q_net.

Root of the problem: the Q network's outputs always contain estimation errors (noise, insufficient training, and so on), and taking the maximum preferentially picks up the positive errors, so the target Q value is systematically overestimated. This overestimation feeds the agent a distorted value signal and ultimately hurts the convergence of the learned policy.
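To make the overestimation mechanism concrete, here is a small self-contained NumPy sketch (added for illustration, not part of the original post). All actions have a true value of 0 and every estimate is unbiased, yet the maximum of the noisy estimates is biased upward:

```python
import numpy as np

rng = np.random.default_rng(0)
true_q = np.zeros(10)        # 10 actions, all with true Q value 0
noise_std = 1.0
n_trials = 100_000

# Unbiased noisy estimates of every action's Q value
noisy_q = true_q + rng.normal(0.0, noise_std, size=(n_trials, 10))

print("true max_a Q(s,a):       ", true_q.max())                # 0.0
print("average max_a Qhat(s,a): ", noisy_q.max(axis=1).mean())  # clearly > 0: systematic overestimation
```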
(2) Core idea: decoupling "action selection" from "value evaluation"

Double DQN uses the current Q network (q_net) to select the action and the target Q network (target_q_net) to evaluate its value, so that a single network's estimation errors are no longer amplified by the max operation.

Target Q-value computation in Double DQN

The computation is split into two steps:

1. Action selection with the current network: $a^* = \arg\max_{a'} Q_\theta(s', a')$;
2. Value evaluation with the target network: $y = r + \gamma\, Q_{\theta^-}(s', a^*)$.

Written in one line: $y = r + \gamma\, Q_{\theta^-}\big(s', \arg\max_{a'} Q_\theta(s', a')\big)$.
(3) Core comparison between Double DQN and the original DQN

The update function of the DQN class is where the two algorithms differ; below, the key lines of code are lined up with the formulas they implement.

1. Original DQN implementation (code snippet)

```python
# Original DQN: the target network both selects and evaluates the action
max_next_q_values = self.target_q_net(next_states).max(1)[0].view(-1, 1)
```

`max(1)[0]` takes, for every next state, the largest Q value over all actions directly from target_q_net, i.e. $\max_{a'} Q_{\theta^-}(s', a')$.

2. Double DQN implementation (code snippet)

```python
# Double DQN: the current network selects the action, the target network evaluates it
max_action = self.q_net(next_states).max(1)[1].view(-1, 1)
max_next_q_values = self.target_q_net(next_states).gather(1, max_action)
```

- `self.q_net(next_states).max(1)[1]`: the current network (q_net) returns, for each next state, the index of the action with the largest Q value (the argmax);
- `self.target_q_net(...).gather(1, max_action)`: the target network (target_q_net) evaluates the action chosen by the current network instead of taking its own maximum.
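As a quick sanity check (added here, not part of the original post), the snippet below builds two small random Q tables and verifies the mechanics: the Double DQN target uses the target network's value at the current network's argmax, and that value can never exceed the vanilla DQN's max over the target network:

```python
import torch

torch.manual_seed(0)
q_next = torch.randn(4, 3)         # stands in for q_net(next_states): 4 states, 3 actions
target_q_next = torch.randn(4, 3)  # stands in for target_q_net(next_states)

# Vanilla DQN: max over the target network's own estimates
vanilla = target_q_next.max(1)[0].view(-1, 1)

# Double DQN: argmax from the current network, value from the target network
max_action = q_next.max(1)[1].view(-1, 1)
double = target_q_next.gather(1, max_action)

print((double <= vanilla).all())  # tensor(True): the Double DQN bootstrap value is never larger
```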
(4) Complete execution flow of Double DQN

The full training loop of Double DQN is:

1. Use the take_action function (ε-greedy) to select an action, interact with the environment, and obtain (s, a, r, s′, done);
2. Store the transition in the ReplayBuffer;
3. Once the buffer holds enough samples, draw a minibatch and compute the target value $y = r + \gamma\, Q_{\theta^-}\big(s', \arg\max_{a'} Q_\theta(s', a')\big)(1 - \text{done})$;
4. Update the current network by gradient descent on the MSE loss $\frac{1}{N}\sum_i \big(Q_\theta(s_i, a_i) - y_i\big)^2$;
5. Every target_update steps, copy the current network's weights into the target network.

(5) Complete Python implementation of Double DQN
We first implement the rl_utils library. It contains a few utility functions, such as plotting moving-average curves and computing advantage estimates, that different algorithms can share. The Python code of rl_utils.py is as follows:

```python
import collections
import random
import numpy as np
import torch
from tqdm import tqdm


class ReplayBuffer:
    '''Experience replay buffer.'''
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)  # FIFO queue

    def add(self, state, action, reward, next_state, done):
        # append one transition to the buffer
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # draw batch_size transitions uniformly at random
        transitions = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*transitions)
        return np.array(state), action, reward, np.array(next_state), done

    def size(self):
        return len(self.buffer)


def moving_average(a, window_size):
    '''Moving-average smoothing of a curve.'''
    cumulative_sum = np.cumsum(np.insert(a, 0, 0))
    middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size
    r = np.arange(1, window_size - 1, 2)
    begin = np.cumsum(a[:window_size - 1])[::2] / r
    end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1]
    return np.concatenate((begin, middle, end))


def train_on_policy_agent(env, agent, num_episodes):
    return_list = []
    for i in range(10):
        with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
            for i_episode in range(int(num_episodes / 10)):
                episode_return = 0
                transition_dict = {'states': [], 'actions': [], 'next_states': [],
                                   'rewards': [], 'dones': []}
                state = env.reset()
                done = False
                while not done:
                    action = agent.take_action(state)
                    next_state, reward, done, _ = env.step(action)
                    transition_dict['states'].append(state)
                    transition_dict['actions'].append(action)
                    transition_dict['next_states'].append(next_state)
                    transition_dict['rewards'].append(reward)
                    transition_dict['dones'].append(done)
                    state = next_state
                    episode_return += reward
                return_list.append(episode_return)
                agent.update(transition_dict)
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({'episode': '%d' % (num_episodes / 10 * i + i_episode + 1),
                                      'return': '%.3f' % np.mean(return_list[-10:])})
                pbar.update(1)
    return return_list


def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size):
    return_list = []
    for i in range(10):
        with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
            for i_episode in range(int(num_episodes / 10)):
                episode_return = 0
                state = env.reset()
                done = False
                while not done:
                    action = agent.take_action(state)
                    next_state, reward, done, _ = env.step(action)
                    replay_buffer.add(state, action, reward, next_state, done)
                    state = next_state
                    episode_return += reward
                    if replay_buffer.size() > minimal_size:
                        b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                        transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns,
                                           'rewards': b_r, 'dones': b_d}
                        agent.update(transition_dict)
                return_list.append(episode_return)
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({'episode': '%d' % (num_episodes / 10 * i + i_episode + 1),
                                      'return': '%.3f' % np.mean(return_list[-10:])})
                pbar.update(1)
    return return_list


def compute_advantage(gamma, lmbda, td_delta):
    '''Generalized advantage estimation.'''
    td_delta = td_delta.detach().numpy()
    advantage_list = []
    advantage = 0.0
    for delta in td_delta[::-1]:
        advantage = gamma * lmbda * advantage + delta
        advantage_list.append(advantage)
    advantage_list.reverse()
    return torch.tensor(advantage_list, dtype=torch.float)
```
The main script below is self-contained (it defines its own replay buffer and helpers rather than importing rl_utils) and trains both the vanilla DQN and Double DQN on Pendulum, tracking the smoothed maximum Q value along the way:

```python
import random
import collections
import gym
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm


class ReplayBuffer:
    '''Experience replay buffer (FIFO queue).'''
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        transitions = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*transitions)
        return np.array(state), action, reward, np.array(next_state), done

    def size(self):
        return len(self.buffer)


class Qnet(torch.nn.Module):
    '''A Q network with a single hidden layer.'''
    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)


class DQN:
    '''DQN agent; dqn_type switches between the vanilla and the Double DQN target.'''
    def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma,
                 epsilon, target_update, device, dqn_type='VanillaDQN'):
        self.action_dim = action_dim
        self.q_net = Qnet(state_dim, hidden_dim, self.action_dim).to(device)
        self.target_q_net = Qnet(state_dim, hidden_dim, self.action_dim).to(device)
        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=learning_rate)
        self.gamma = gamma
        self.epsilon = epsilon
        self.target_update = target_update
        self.count = 0
        self.dqn_type = dqn_type
        self.device = device

    def take_action(self, state):
        # epsilon-greedy exploration
        if np.random.random() < self.epsilon:
            action = np.random.randint(self.action_dim)
        else:
            # build the tensor from a numpy array first to avoid the slow-creation warning
            state = torch.tensor(np.array([state]), dtype=torch.float32).to(self.device)
            action = self.q_net(state).argmax().item()
        return action

    def max_q_value(self, state):
        state = torch.tensor(np.array([state]), dtype=torch.float32).to(self.device)
        return self.q_net(state).max().item()

    def update(self, transition_dict):
        states = torch.tensor(transition_dict['states'], dtype=torch.float32).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float32).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float32).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float32).view(-1, 1).to(self.device)

        q_values = self.q_net(states).gather(1, actions)
        if self.dqn_type == 'DoubleDQN':
            # Double DQN: q_net selects the action, target_q_net evaluates it
            max_action = self.q_net(next_states).max(1)[1].view(-1, 1)
            max_next_q_values = self.target_q_net(next_states).gather(1, max_action)
        else:
            # Vanilla DQN: target_q_net both selects and evaluates
            max_next_q_values = self.target_q_net(next_states).max(1)[0].view(-1, 1)
        q_targets = rewards + self.gamma * max_next_q_values * (1 - dones)

        dqn_loss = torch.mean(F.mse_loss(q_values, q_targets))
        self.optimizer.zero_grad()
        dqn_loss.backward()
        self.optimizer.step()

        if self.count % self.target_update == 0:
            self.target_q_net.load_state_dict(self.q_net.state_dict())  # sync the target network
        self.count += 1


def dis_to_con(discrete_action, env, action_dim):
    '''Map a discrete action index onto the continuous torque range of Pendulum.'''
    low, high = env.action_space.low[0], env.action_space.high[0]
    return low + (discrete_action / (action_dim - 1)) * (high - low)


def train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size):
    return_list, max_q_value_list = [], []
    max_q_value = 0
    for i in range(10):
        with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
            for i_episode in range(int(num_episodes / 10)):
                episode_return = 0
                state, _ = env.reset()
                done = False
                while not done:
                    action = agent.take_action(state)
                    # exponentially smoothed maximum Q value, used in the plots below
                    max_q_value = agent.max_q_value(state) * 0.005 + max_q_value * 0.995
                    max_q_value_list.append(max_q_value)
                    action_continuous = dis_to_con(action, env, agent.action_dim)
                    next_state, reward, terminated, truncated, _ = env.step([action_continuous])
                    done = terminated or truncated
                    replay_buffer.add(state, action, reward, next_state, done)
                    state = next_state
                    episode_return += reward
                    if replay_buffer.size() > minimal_size:
                        b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                        transition_dict = {'states': b_s, 'actions': b_a, 'rewards': b_r,
                                           'next_states': b_ns, 'dones': b_d}
                        agent.update(transition_dict)
                return_list.append(episode_return)
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({'episode': '%d' % (num_episodes / 10 * i + i_episode + 1),
                                      'return': '%.3f' % np.mean(return_list[-10:])})
                pbar.update(1)
    return return_list, max_q_value_list


def plot_results(return_list, max_q_value_list, algo_name):
    plt.plot(list(range(len(return_list))), return_list)
    plt.xlabel('Episodes')
    plt.ylabel('Returns')
    plt.title('%s on Pendulum' % algo_name)
    plt.savefig('%s_Pendulum_returns.png' % algo_name, dpi=300)
    plt.close()

    plt.plot(list(range(len(max_q_value_list))), max_q_value_list)
    plt.axhline(0, c='orange', ls='--')
    plt.xlabel('Frames')
    plt.ylabel('Q value')
    plt.title('%s on Pendulum' % algo_name)
    plt.savefig('%s_Pendulum_q_value.png' % algo_name, dpi=300)
    plt.close()


# hyperparameters
lr = 5e-3
num_episodes = 200
hidden_dim = 128
gamma = 0.98
epsilon = 0.01
target_update = 50
buffer_size = 5000
minimal_size = 1000
batch_size = 64
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

env = gym.make('Pendulum-v1')
state_dim = env.observation_space.shape[0]
action_dim = 11  # discretize the continuous torque into 11 actions
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# vanilla DQN
replay_buffer = ReplayBuffer(buffer_size)
agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device)
return_list, max_q_value_list = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size)
plot_results(return_list, max_q_value_list, 'DQN')

# Double DQN
replay_buffer = ReplayBuffer(buffer_size)
agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device, 'DoubleDQN')
return_list, max_q_value_list = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size)
plot_results(return_list, max_q_value_list, 'DoubleDQN')
```

(6) Program results and analysis

Results

The console prints the tqdm progress bars for each run (Iteration 0-9, each reaching 100%), and four figures are saved: the episode-return curve and the smoothed maximum Q-value curve on Pendulum, first for the vanilla DQN and then for Double DQN. [result figures omitted]
Results analysis

From the run results we can see that the vanilla DQN achieves a reasonable return on Pendulum but clearly overestimates: its smoothed maximum Q value climbs well above 0, even though rewards in Pendulum are never positive, so no state-action value can exceed 0. Double DQN reaches a comparable return while keeping its Q-value estimates much lower and closer to the truth, which shows that it does alleviate the overestimation problem.
(7) Advantages and applicable scenarios of Double DQN

Advantages

- By decoupling action selection from value evaluation, it suppresses the systematic bias introduced by the max operator, so the Q-value estimates are closer to the true values.

Applicable scenarios

- Scenarios that demand accurate Q-value estimates, for example sparse-reward environments, where overestimation can prevent the agent from learning an effective policy.

(8) Key takeaways

- Double DQN targets the Q-value overestimation problem of the original DQN;
- Its target value is $y = r + \gamma\, Q_{\theta^-}\big(s', \arg\max_{a'} Q_\theta(s', a')\big)$: the current network picks the action, the target network scores it;
- Double DQN changes only the target computation inside update; the network structure, the replay buffer and the rest of the training loop are identical to the original DQN, which also makes it easy to combine with other improvements.
III. The Dueling DQN Algorithm

(1) Background: optimizing the structure used to learn Q values

The original DQN's network maps a state directly to one Q value per action, Q(s,a), but this design has a problem: a Q value actually mixes two core pieces of information, the value of the state itself and the relative merit of the chosen action. Learning the two entangled together is redundant and inefficient.

For example, in some states every action leads to much the same outcome, so all the Q values are dominated by the state's own value, while in other states the choice of action makes a large difference. The original DQN cannot tell these two kinds of information apart, so its learning efficiency suffers.

Dueling DQN therefore splits the Q value into a "state value" and an "action advantage" and lets the network learn the two separately, which improves learning efficiency and generalization.
(2) Core idea: splitting the Q value into V(s) and A(s,a)

1. Core formula: decomposing and reconstructing the Q value

Dueling DQN decomposes the action value into two parts: Q(s,a) = V(s) + A(s,a), where

- V(s), the state value, measures "how good it is to be in state s" (independent of the action);
- A(s,a), the action advantage, measures "how much better action a is than the other actions in state s".

2. Key refinement: resolving the "identifiability" problem

The naive decomposition Q(s,a) = V(s) + A(s,a) is not identifiable: adding a constant to V(s) while subtracting the same constant from every A(s,a) leaves Q(s,a) unchanged but destroys the intended meaning of V and A.

Dueling DQN therefore centers the advantages in the value it actually outputs:

$$
Q(s,a) = V(s) + A(s,a) - \frac{1}{|\mathcal{A}|}\sum_{a'} A(s,a'),
$$

where $\frac{1}{|\mathcal{A}|}\sum_{a'} A(s,a')$ is the mean of the advantage values of all actions in the current state. Subtracting it forces the advantages to have zero mean, which pins down a unique V / A split.
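A short derivation (added here for clarity, not from the original post) shows why the centering removes the ambiguity. Averaging the centered form over the actions gives

$$
\frac{1}{|\mathcal{A}|}\sum_{a} Q(s,a)
= V(s) + \frac{1}{|\mathcal{A}|}\sum_{a} A(s,a) - \frac{1}{|\mathcal{A}|}\sum_{a'} A(s,a')
= V(s),
$$

so V(s) is forced to equal the mean of the Q values in state s, and A(s,a) to equal Q(s,a) minus that mean; there is no longer any freedom to shift a constant between the two streams without changing the network's output.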
(3) Network structure of Dueling DQN

In the code, the VAnet class is the core of Dueling DQN: it replaces the single-stream Qnet used by the original DQN.
1. Qnet of the original DQN (single hidden layer)
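The original post's Qnet listing did not survive extraction cleanly; the sketch below is a minimal reconstruction consistent with the fragments elsewhere in this article (a single hidden layer followed by a linear layer that outputs one Q value per action). Treat the exact layer sizes as assumptions.

```python
import torch
import torch.nn.functional as F


class Qnet(torch.nn.Module):
    '''A Q network with a single hidden layer.'''
    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))  # hidden layer with ReLU activation
        return self.fc2(x)       # one Q value per action
```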
2. VAnet of Dueling DQN (separate V and A streams)
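The VAnet listing was likewise garbled; the following reconstruction matches the forward-pass fragments that do survive in this post (self.fc_A(F.relu(self.fc1(x))), self.fc_V(F.relu(self.fc1(x))), and Q = V + A - A.mean(1).view(-1, 1)). The layer dimensions are assumptions.

```python
class VAnet(torch.nn.Module):
    '''Dueling network: a shared hidden layer feeding separate V and A heads.'''
    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)    # shared feature layer
        self.fc_A = torch.nn.Linear(hidden_dim, action_dim)  # advantage head A(s, a)
        self.fc_V = torch.nn.Linear(hidden_dim, 1)           # state-value head V(s)

    def forward(self, x):
        A = self.fc_A(F.relu(self.fc1(x)))
        V = self.fc_V(F.relu(self.fc1(x)))
        # center the advantages so that the V / A decomposition is identifiable
        Q = V + A - A.mean(1).view(-1, 1)
        return Q
```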
A line-by-line reading of VAnet (the shapes can be verified with the snippet after this list):

- self.fc1: the feature-extraction layer shared by both branches, so state features are not learned twice;
- self.fc_A: outputs the advantage value A(s,a) of every action, with shape [batch_size, action_dim];
- self.fc_V: outputs the state value V(s), with shape [batch_size, 1];
- A.mean(1).view(-1, 1): the per-sample mean of the advantages (mean(1) averages over the action dimension, view keeps the result broadcastable against A);
- Q = V + A - A.mean(1).view(-1, 1): the centered combination, which the rest of the agent consumes as ordinary Q values.
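A quick shape check (added here; it assumes the VAnet sketch above and uses Pendulum-like dimensions) confirms the shapes listed above and the fact that averaging over actions recovers V(s):

```python
state_dim, hidden_dim, action_dim = 3, 128, 11  # e.g. Pendulum: 3-dim state, 11 discretized actions
net = VAnet(state_dim, hidden_dim, action_dim)

x = torch.randn(32, state_dim)                  # a batch of 32 states
A = net.fc_A(F.relu(net.fc1(x)))
V = net.fc_V(F.relu(net.fc1(x)))
Q = net(x)

print(A.shape, V.shape, Q.shape)                # [32, 11], [32, 1], [32, 11]
print(torch.allclose(Q.mean(1), V.squeeze(1)))  # True: averaging Q over actions gives V(s)
```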
(4) Training logic of Dueling DQN (differences from the original DQN)

The initialization and update logic of the DQN class is written to be compatible with both algorithms: a dqn_type flag decides whether the agent builds a VAnet (Dueling DQN) or a Qnet (original DQN), and everything else is shared.
1. Network choice at initialization

```python
if dqn_type == 'DuelingDQN':  # Dueling DQN uses the V / A network
    self.q_net = VAnet(state_dim, hidden_dim, self.action_dim).to(device)
    self.target_q_net = VAnet(state_dim, hidden_dim, self.action_dim).to(device)
else:                         # the original DQN (and Double DQN) use the plain Qnet
    self.q_net = Qnet(state_dim, hidden_dim, self.action_dim).to(device)
    self.target_q_net = Qnet(state_dim, hidden_dim, self.action_dim).to(device)
```

- Dueling DQN builds a VAnet, the original DQN builds a Qnet;
- Whichever network is chosen, the output is always a [batch_size, action_dim] tensor of Q values, so the subsequent take_action and update logic is reused without modification (this is what makes Dueling DQN such a lightweight change).
2. Training / update logic

The update function of Dueling DQN is exactly the same as that of the original DQN: it samples (states, actions, rewards, next_states, dones) from the replay buffer, computes the target $y = r + \gamma \max_{a'} Q_{\theta^-}(s', a')(1 - \text{done})$, and minimizes the MSE loss between $Q_\theta(s,a)$ and $y$.

The only real difference is the internal structure of self.q_net: the Q value is produced as V plus the centered A instead of by a single output head. The loss, the target-network synchronization and the rest of the training loop are untouched.
(5) Complete Python implementation of Dueling DQN

The Dueling DQN script follows the Double DQN listing above almost line for line: it re-declares its own ReplayBuffer locally (to avoid the rl_utils dependency) and reuses the same Qnet, dis_to_con, train_DQN, plotting code and hyperparameters. Only the parts that actually change are shown here: the VAnet class, the network choice inside the agent, and the final run.

```python
class VAnet(torch.nn.Module):
    '''Dueling network: shared hidden layer, separate V and A heads.'''
    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)    # shared feature layer
        self.fc_A = torch.nn.Linear(hidden_dim, action_dim)  # advantage head
        self.fc_V = torch.nn.Linear(hidden_dim, 1)           # state-value head

    def forward(self, x):
        A = self.fc_A(F.relu(self.fc1(x)))
        V = self.fc_V(F.relu(self.fc1(x)))
        return V + A - A.mean(1).view(-1, 1)  # centered combination


# Inside DQN.__init__ the network construction becomes type-dependent;
# take_action, max_q_value and update stay exactly as in the previous listing:
#     if dqn_type == 'DuelingDQN':
#         self.q_net = VAnet(state_dim, hidden_dim, self.action_dim).to(device)
#         self.target_q_net = VAnet(state_dim, hidden_dim, self.action_dim).to(device)
#     else:
#         self.q_net = Qnet(state_dim, hidden_dim, self.action_dim).to(device)
#         self.target_q_net = Qnet(state_dim, hidden_dim, self.action_dim).to(device)

# Train Dueling DQN on Pendulum with the same hyperparameters as before
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
replay_buffer = ReplayBuffer(buffer_size)  # locally defined ReplayBuffer, no rl_utils dependency
agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon, target_update, device, 'DuelingDQN')
return_list, max_q_value_list = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size)
plot_results(return_list, max_q_value_list, 'DuelingDQN')  # saves DuelingDQN_Pendulum_returns.png / _q_value.png
```
(6) Program results and analysis

Results

Training prints the tqdm progress bars (Iteration 0-9, each reaching 100%) and saves two figures: the episode-return curve and the smoothed maximum Q-value curve of Dueling DQN on Pendulum. [result figures omitted]
Results analysis

From the run results we can see that, compared with the traditional DQN, Dueling DQN learns more stably when there are many candidate actions and reaches a higher best return, while its Q-value estimates stay in a reasonable range; learning a shared state value V(s) reduces how much has to be learned for each individual action.
(7) Advantages and applicable scenarios of Dueling DQN

Core advantages

- It learns the "state value" and the "action advantage" separately, avoiding the redundant learning caused by mixing the two kinds of information;
- When the value of a state changes, updating V(s) adjusts the Q values of all actions in that state at once, and the advantage head only needs to fine-tune the relative ordering of the actions;
- It is orthogonal to other improvements (e.g. Double DQN) and can be combined with them to further boost performance.

Applicable scenarios

- Environments in which the state itself carries most of the value, such as Pendulum (the pendulum's position determines most of the value) or maze-like environments (closeness to the goal determines most of the value);
- Environments with larger action spaces in which many actions have similar values: a shared V(s) plus small advantages is easier to learn than a separate Q value per action, which lowers the learning difficulty.
(8) Dueling DQN vs. Double DQN: key differences

The two algorithms are often confused; here is a side-by-side comparison:

| Dimension | Double DQN | Dueling DQN |
|---|---|---|
| Core goal | Mitigate Q-value overestimation (decouple "action selection" from "value evaluation") | Learn Q values more efficiently (split them into "state value" and "action advantage") |
| What is changed | The target Q-value computation | The internal structure of the Q network |
| Code change | The target computation inside the update function | Replace Qnet with VAnet |
| Compatibility | Can be combined with Dueling DQN and other improvements | Can be combined with Double DQN and other improvements |
(9) Key takeaways

- Dueling DQN splits the Q value into a state value V(s) (action-independent) and an action advantage A(s,a) (action-dependent), and resolves the identifiability problem by centering the advantages (A − Ā);
- In the code, the VAnet class is the key change: a shared feature layer feeding separate V and A heads, recombined into ordinary Q values;
- Because the network still outputs one Q value per action, the action-selection and update procedures need no modification.
IV. Summary

On top of the traditional DQN there are two variants that are very easy to implement: Double DQN and Dueling DQN. Double DQN fixes DQN's overestimation of Q values, while Dueling DQN learns the differences between actions well and is particularly effective when the action space is large.

The principles behind Double DQN and Dueling DQN also show what deep reinforcement learning research cares about: combining deep learning and reinforcement learning effectively. On one side, given deep learning modules, the question is how reinforcement learning methods can work more effectively and avoid the problems introduced by the learning behaviour of deep models, for example using Double DQN to fix the overestimation of Q values. On the other side, in a reinforcement learning setting, the question is how a deep model can learn useful patterns effectively, for example designing the Dueling DQN network architecture to learn the state-value function and the action-advantage function efficiently.