Overview of Reinforcement Learning
Reinforcement Learning (RL) is a class of algorithms that learn optimal behavior by interacting with an environment. RL models an agent that takes actions in the environment and receives rewards (positive or negative) based on the outcomes of those actions. The goal of RL is to maximize the agent's cumulative reward over the long run, that is, to learn an optimal policy through trial and error.
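The interaction can be summarized as a simple loop: observe a state, act, receive a reward and the next state, repeat. The sketch below illustrates this loop; it assumes a hypothetical Gym-style environment `env` and a policy function `choose_action`, both of which are illustrative names, not part of the original article.

def run_episode(env, choose_action):
    # One episode of agent-environment interaction (Gym-style API assumed).
    state = env.reset()
    total_reward, done = 0.0, False
    while not done:
        action = choose_action(state)              # agent picks an action
        state, reward, done, _ = env.step(action)  # environment returns feedback
        total_reward += reward                     # accumulate the reward signal
    return total_reward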
Advanced Reinforcement Learning: Q-Learning and Deep Q-Networks (DQN)
Q-Learning
Q-learning is a concrete instance of model-free reinforcement learning. It guides decisions by learning the expected cumulative reward of each state-action pair (the action-value function), with the goal of finding the policy that maximizes cumulative reward. Its core update rule follows from the Bellman optimality principle.
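Concretely, with learning rate $\alpha$ and discount factor $\gamma$, the tabular Q-learning update is:

$$Q(s_t, a_t) \leftarrow (1 - \alpha)\, Q(s_t, a_t) + \alpha \left[ r_{t+1} + \gamma \max_{a'} Q(s_{t+1}, a') \right]$$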
import numpy as np

def epsilon_greedy(Q, state, explore_rate):
    # Epsilon-greedy action selection: explore with probability explore_rate.
    if np.random.rand() < explore_rate:
        return np.random.choice(Q.shape[1])
    return int(np.argmax(Q[state]))

def q_learning(Q, state, action, reward, next_state, learning_rate, discount_factor, done):
    # Q-learning update from the Bellman optimality equation: bootstrap from the
    # best action in the next state, regardless of the action actually taken next.
    current_q = Q[state][action]
    if done:
        target = reward                                            # terminal state: no bootstrap
    else:
        target = reward + discount_factor * np.max(Q[next_state])  # greedy bootstrap
    Q[state][action] = (1 - learning_rate) * current_q + learning_rate * target
    return Q
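A hedged usage sketch follows: it trains a Q-table on a hypothetical Gym-style environment `env` with `n_states` discrete states and `n_actions` discrete actions (these names are illustrative placeholders).

Q = np.zeros((n_states, n_actions))          # hypothetical n_states, n_actions
for episode in range(1000):
    state = env.reset()                       # hypothetical Gym-style env
    done = False
    while not done:
        action = epsilon_greedy(Q, state, explore_rate=0.1)
        next_state, reward, done, _ = env.step(action)
        Q = q_learning(Q, state, action, reward, next_state,
                       learning_rate=0.1, discount_factor=0.99, done=done)
        state = next_state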
Deep Q-Networks (DQN)
DQN is the deep-learning extension of Q-learning: it uses a deep neural network to approximate the Q-function. It addresses instability in the learning process with experience replay and a target network, which let the algorithm learn a value function from past experience far more stably (a sketch of the replay and target-network training loop follows the class definition below).
import tensorflow as tf  # TensorFlow 1.x API

class DQN:
    def __init__(self, session, input_size, output_size, name='main'):
        self.session = session
        self.name = name
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, input_size], name='inputs')
            self.targets_ = tf.placeholder(tf.float32, [None, output_size], name='targets')
            self.output = self.build_network(output_size)
            # Mean-squared error between predicted and target Q-values.
            self.loss = tf.losses.mean_squared_error(self.targets_, self.output)
            self.optimize = tf.train.AdamOptimizer(learning_rate=0.001).minimize(self.loss)
        # All trainable variables in this network's scope, used to sync the target network.
        self.all_weights = tf.trainable_variables(scope=name)

    def build_network(self, output_size, num_layers=3, num_nodes=50):
        # Stack of fully connected hidden layers with ReLU activations.
        layer = self.inputs_
        for i in range(num_layers):
            layer = tf.layers.dense(layer, num_nodes, tf.nn.relu,
                                    kernel_initializer=tf.random_normal_initializer(0, 0.01),
                                    bias_initializer=tf.constant_initializer(0.1),
                                    name=f'hidden_{i}')
        # Linear output layer: one Q-value per action.
        return tf.layers.dense(layer, output_size,
                               kernel_initializer=tf.random_normal_initializer(0, 0.01),
                               bias_initializer=tf.constant_initializer(0.1),
                               name='output')

    def train_network(self, inputs, targets):
        # One gradient step on a batch of (state, target Q-value) pairs.
        _, loss = self.session.run([self.optimize, self.loss],
                                   feed_dict={self.inputs_: inputs, self.targets_: targets})
        return loss

    def predict(self, inputs):
        # Q-value estimates for a batch of states.
        return self.session.run(self.output, feed_dict={self.inputs_: inputs})

    def update_target_network(self, main_network):
        # Copy weights from the online ("main") network into this (target) network.
        assign_ops = [tf.assign(t, m) for t, m in zip(self.all_weights, main_network.all_weights)]
        self.session.run(assign_ops)
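The class above only defines the network; the stabilizing tricks mentioned earlier live in the training loop. Below is a hedged sketch of that loop with experience replay and periodic target-network syncing. Here `env`, `state_size`, `n_actions`, and `total_steps` are hypothetical placeholders, not part of the original article.

import random
import numpy as np
from collections import deque

sess = tf.Session()
main_net = DQN(sess, state_size, n_actions, name='main')      # hypothetical sizes
target_net = DQN(sess, state_size, n_actions, name='target')
sess.run(tf.global_variables_initializer())

replay_buffer = deque(maxlen=50000)   # experience replay: store past transitions
gamma, batch_size = 0.99, 32

for step in range(total_steps):
    # ... interact with env and append (state, action, reward, next_state, done) to replay_buffer ...
    if len(replay_buffer) >= batch_size:
        batch = random.sample(replay_buffer, batch_size)
        states = np.array([b[0] for b in batch])
        next_q = target_net.predict(np.array([b[3] for b in batch]))
        targets = main_net.predict(states)
        for i, (s, a, r, s2, done) in enumerate(batch):
            # TD targets are computed with the frozen target network for stability.
            targets[i][a] = r if done else r + gamma * np.max(next_q[i])
        main_net.train_network(states, targets)
    if step % 1000 == 0:
        target_net.update_target_network(main_net)   # periodic weight sync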
Advanced Reinforcement Learning: Policy Gradients
Policy Gradient Methods
Policy gradient methods are a family of reinforcement learning algorithms that optimize the policy directly: they adjust the policy parameters to maximize long-term cumulative reward, without needing to learn a value function first. They are grounded in the policy gradient theorem, which gives an estimate of the gradient of the expected return with respect to the policy parameters.
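In its common form, the policy gradient theorem states that for a policy $\pi_\theta$:

$$\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\!\left[\nabla_\theta \log \pi_\theta(a \mid s)\, Q^{\pi_\theta}(s, a)\right]$$

so the gradient can be estimated from sampled trajectories by weighting log-probability gradients with returns (REINFORCE) or with an advantage estimate (actor-critic methods).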
A2C (Advantage Actor-Critic)
A2C is an algorithm that combines policy gradients with value-function methods: it learns both a policy (the actor) and a value function (the critic), and uses an advantage function to optimize the policy. The A2C workflow consists of collecting data, computing advantage estimates, and updating the policy network and the value network.
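The advantage measures how much better a chosen action is than the critic's estimate of the state's value; a common one-step estimate is:

$$A(s_t, a_t) \approx r_{t+1} + \gamma V(s_{t+1}) - V(s_t)$$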
import math
import torch
import torch.nn as nn
from torch.distributions import Normal

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, action_std):
        super(ActorCritic, self).__init__()
        # State-independent log standard deviation of the Gaussian policy, initialised from action_std.
        self.action_log_std = nn.Parameter(torch.full((1, action_dim), math.log(action_std)))
        # Actor outputs the mean of the Gaussian policy; critic outputs a scalar state value.
        self.actor = nn.Sequential(nn.Linear(state_dim, 256), nn.ReLU(), nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, action_dim))
        self.critic = nn.Sequential(nn.Linear(state_dim, 256), nn.ReLU(), nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 1))

    def forward(self):
        raise NotImplementedError

    def act(self, state):
        # Sample an action from the Gaussian policy and return its log-probability.
        action_mean = self.actor(state)
        action_std = self.action_log_std.exp().squeeze(0)   # broadcasts over the batch
        dist = Normal(action_mean, action_std)
        action = dist.sample()
        return action, dist.log_prob(action).sum(-1)

    def evaluate(self, state, action):
        # Log-probability of a given action, the state value, and the policy entropy.
        action_mean = self.actor(state)
        action_std = self.action_log_std.exp().squeeze(0)
        dist = Normal(action_mean, action_std)
        action_logprob = dist.log_prob(action).sum(-1)
        state_value = self.critic(state)
        return action_logprob, state_value, dist.entropy().sum(-1)
PPO (Proximal Policy Optimization)
PPO is an efficient policy gradient algorithm that improves stability and sample efficiency by limiting how far each policy update can move the policy. It works well across a wide range of reinforcement learning tasks and is particularly effective in continuous-control environments.
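The key idea is the clipped surrogate objective. With probability ratio $r_t(\theta) = \pi_\theta(a_t \mid s_t) / \pi_{\theta_{\text{old}}}(a_t \mid s_t)$ and advantage estimate $\hat{A}_t$, PPO maximizes:

$$L^{\text{CLIP}}(\theta) = \mathbb{E}_t\!\left[\min\!\left(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}\!\left(r_t(\theta),\, 1-\epsilon,\, 1+\epsilon\right)\hat{A}_t\right)\right]$$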
import torch
import torch.nn.functional as F
import torch.optim as optim

class PPO:
    def __init__(self, state_dim, action_dim, action_std, clip_param=0.2, epochs=5):
        # Reuse the ActorCritic module defined above; PPO only changes how it is updated.
        self.policy = ActorCritic(state_dim, action_dim, action_std)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=0.0003)
        self.clip_param = clip_param
        self.epochs = epochs

    def act(self, state):
        # Sample an action with the current policy; the log-probability is stored
        # so it can serve as the "old policy" term in later updates.
        with torch.no_grad():
            action, log_prob = self.policy.act(state)
        return action, log_prob

    def ppo_update(self, transitions):
        states = torch.tensor(transitions['states'], dtype=torch.float32)
        actions = torch.tensor(transitions['actions'], dtype=torch.float32)
        returns = torch.tensor(transitions['returns'], dtype=torch.float32)       # discounted returns
        old_log_probs = torch.tensor(transitions['log_probs'], dtype=torch.float32)

        for _ in range(self.epochs):
            log_probs, state_values, entropy = self.policy.evaluate(states, actions)
            state_values = state_values.squeeze(-1)
            # Advantages of the stored actions under the current value estimate.
            advantages = (returns - state_values).detach()
            # Probability ratio between the new policy and the data-collecting policy.
            ratios = torch.exp(log_probs - old_log_probs)
            # Clipped surrogate objective limits how far a single update can move the policy.
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.clip_param, 1 + self.clip_param) * advantages
            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = F.mse_loss(state_values, returns)
            loss = policy_loss + 0.5 * value_loss - 0.01 * entropy.mean()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
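For ppo_update to compute the probability ratio, the rollout must store the old policy's log-probabilities alongside states, actions, and discounted returns. A hedged sketch of collecting such a batch follows; `env` is a hypothetical Gym-style continuous-control environment and is not defined in the original article.

def collect_rollout(agent, env, horizon=2048, gamma=0.99):
    # Roll the current policy forward and record everything ppo_update needs.
    states, actions, rewards, log_probs, dones = [], [], [], [], []
    state = env.reset()
    for _ in range(horizon):
        action, log_prob = agent.act(torch.tensor(state, dtype=torch.float32))
        next_state, reward, done, _ = env.step(action.numpy())
        states.append(state); actions.append(action.numpy())
        rewards.append(reward); log_probs.append(log_prob.item()); dones.append(done)
        state = env.reset() if done else next_state
    # Discounted returns computed backwards, resetting at episode boundaries.
    returns, running = [], 0.0
    for r, d in zip(reversed(rewards), reversed(dones)):
        running = r + gamma * running * (1.0 - d)
        returns.insert(0, running)
    return {'states': states, 'actions': actions, 'returns': returns, 'log_probs': log_probs}

# Typical usage: transitions = collect_rollout(agent, env); agent.ppo_update(transitions)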
Introduction to RLHF and How It Works
The Concept of RLHF and Its Background
RLHF, or Reinforcement Learning from Human Feedback, is a method for optimizing a reinforcement learning model with human feedback. It is designed to address the mismatch between model outputs and human preferences: by plugging human feedback directly into the training signal, it steers the model's decision process toward what people actually want.
The Core RLHF Pipeline
Pretraining the Model and Fine-Tuning (Optional)
Before human feedback is introduced, the model is usually pretrained on large amounts of data to build basic semantic understanding.
Training the Reward Model (the Judge)
A reward model is built as the quantitative proxy for human feedback: it is trained to extract a preference signal from the rankings supplied by human annotators.
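A common way to train the reward model $r_\theta$ from ranked pairs is the pairwise (Bradley-Terry style) loss, where $y_w$ is the preferred and $y_l$ the rejected response to prompt $x$:

$$\mathcal{L}(\theta) = -\log \sigma\!\left(r_\theta(x, y_w) - r_\theta(x, y_l)\right)$$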
Reinforcement Learning Policy Fine-Tuning (Optimizing the Language Model)
Within a reinforcement learning framework, an RLHF algorithm such as PPO is used to optimize the language model's policy so that its outputs better match human expectations.
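A common formulation of this stage maximizes the reward model's score while penalizing divergence from the original (reference) model $\pi_{\text{ref}}$:

$$\max_{\phi}\ \mathbb{E}_{x \sim \mathcal{D},\ y \sim \pi_\phi(\cdot \mid x)}\!\left[r_\theta(x, y) - \beta \log\frac{\pi_\phi(y \mid x)}{\pi_{\text{ref}}(y \mid x)}\right]$$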
RLHF in Practice: Case Study and Steps
Fine-Tuning a Decoder-Architecture Model
Task Formulation
For language-model tasks such as dialogue generation or text summarization, RLHF improves generation quality by adjusting the language model's output policy.
Training the Reward Model
The reward model learns from human rankings over multiple candidate outputs and can then quantify the quality of generated content. This can be implemented with a deep learning framework by training a model with a suitable loss so that it maps a text sequence to a numerical score; the example below trains a pairwise preference classifier over two candidate outputs as a simple version of this idea.
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

# feedback.csv: each row holds two candidate outputs (Title1, Title2) and a Label
# indicating which one the human annotator preferred.
df = pd.read_csv('feedback.csv')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

class FeedbackDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        title1 = ' '.join(str(self.data.Title1[index]).split())
        title2 = ' '.join(str(self.data.Title2[index]).split())
        # Encode the two candidates as a single sentence pair.
        encoding = self.tokenizer(title1, title2, max_length=self.max_len,
                                  padding='max_length', truncation=True, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.data['Label'][index], dtype=torch.long)
        }

    def __len__(self):
        return self.len

class FeedbackModel(torch.nn.Module):
    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.dropout = torch.nn.Dropout(0.1)
        # Two logits: "first candidate preferred" vs. "second candidate preferred".
        self.output = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        return self.output(self.dropout(outputs.pooler_output))

batch_size = 8
epochs = 3
model = FeedbackModel(bert)
dset = FeedbackDataset(df, tokenizer, max_len=512)
train_loader = torch.utils.data.DataLoader(dset, batch_size=batch_size, shuffle=True)
optimizer = AdamW(model.parameters(), lr=1e-5)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(epochs):
    for data in train_loader:
        logits = model(data['input_ids'], data['attention_mask'])
        loss = criterion(logits, data['labels'])
        model.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
Policy Optimization with an RLHF Algorithm
An RLHF algorithm such as PPO is then applied to fine-tune the language model's policy: the policy is updated according to the reward model's feedback, improving the quality of the generated text.
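In practice, the reward fed to PPO usually combines the reward model's score with a per-token KL penalty against the frozen reference model, so the tuned policy does not drift too far from it. The sketch below illustrates this reward shaping with dummy tensors; `rlhf_rewards` and its inputs are illustrative names, not an API from a specific library.

import torch

def rlhf_rewards(reward_scores, logprobs, ref_logprobs, beta=0.1):
    # Per-token KL penalty keeps the tuned policy close to the reference model.
    kl = logprobs - ref_logprobs
    rewards = -beta * kl
    # The reward model's scalar preference score is added at the final generated token.
    rewards[:, -1] += reward_scores
    return rewards

# Dummy example: a batch of 2 responses with 5 generated tokens each.
reward_scores = torch.tensor([0.8, -0.3])
logprobs = torch.randn(2, 5)
ref_logprobs = torch.randn(2, 5)
print(rlhf_rewards(reward_scores, logprobs, ref_logprobs))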
Strengths and Limitations of RLHF
Potential Strengths
By taking human feedback directly into account, RLHF improves the quality and consistency of model outputs, and the gains are especially visible in domain-specific applications.
Challenges and Limitations
RLHF carries a high computational cost for large-scale data processing and model optimization, and it depends on high-quality annotations and a reliable feedback mechanism, which can limit how widely it is adopted in some settings.
Conclusion and Outlook
Prospects for RLHF in Generation Tasks
As natural language processing continues to advance, RLHF is expected to play a role in an ever wider range of generation tasks, improving model interpretability and contextual consistency.
Future Directions for RLHF Research and Practice
Future work will focus on making RLHF more efficient, reducing annotation costs, and strengthening models' adaptability in complex scenarios, as well as exploring combinations with multimodal tasks to drive innovative AI applications across more domains.