The Soft Actor-Critic (SAC) Algorithm


Code

python">import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

# Actor network: outputs the mean and a clamped log-std of a Gaussian policy
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.mu = nn.Linear(256, action_dim)
        self.log_std = nn.Linear(256, action_dim)
        self.max_action = max_action

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        mu = self.mu(x)
        log_std = torch.clamp(self.log_std(x), -20, 2)
        std = torch.exp(log_std)
        return mu, std

    def sample(self, state):
        mu, std = self.forward(state)
        dist = torch.distributions.Normal(mu, std)
        x = dist.rsample()  # reparameterized sample, so gradients flow to the policy
        action = torch.tanh(x) * self.max_action
        # Log-probability of the pre-tanh sample x, plus the tanh change-of-variables
        # correction; 2*(log 2 - x - softplus(-2x)) is a stable form of log(1 - tanh(x)^2)
        log_prob = dist.log_prob(x).sum(axis=-1, keepdim=True)
        log_prob -= (2 * (np.log(2) - x - F.softplus(-2 * x))).sum(axis=-1, keepdim=True)
        return action, log_prob


# Critic network: estimates Q(s, a)
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


# SAC agent: twin critics with target networks and automatic entropy tuning
class SAC:
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action)
        self.critic1 = Critic(state_dim, action_dim)
        self.critic2 = Critic(state_dim, action_dim)
        self.target_critic1 = Critic(state_dim, action_dim)
        self.target_critic2 = Critic(state_dim, action_dim)
        self.target_critic1.load_state_dict(self.critic1.state_dict())
        self.target_critic2.load_state_dict(self.critic2.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=3e-4)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=3e-4)
        # Learn the temperature alpha in log space, starting from alpha = 0.1
        self.log_alpha = torch.tensor(np.log(0.1), dtype=torch.float32, requires_grad=True)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=3e-4)
        self.target_entropy = -action_dim  # common heuristic: -|A|
        self.gamma = 0.99
        self.tau = 0.005

    def select_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1))  # ensure shape (1, state_dim)
        action, _ = self.actor.sample(state)
        return action.cpu().data.numpy().flatten()

    def update(self, replay_buffer, batch_size=256):
        state, action, next_state, reward, done = replay_buffer.sample(batch_size)
        state = torch.FloatTensor(state)
        action = torch.FloatTensor(action)
        next_state = torch.FloatTensor(next_state)
        reward = torch.FloatTensor(reward).unsqueeze(1)
        done = torch.FloatTensor(done).unsqueeze(1)

        # Soft Bellman target with the clipped double-Q trick
        with torch.no_grad():
            next_action, next_log_prob = self.actor.sample(next_state)
            target_q1 = self.target_critic1(next_state, next_action)
            target_q2 = self.target_critic2(next_state, next_action)
            target_q = torch.min(target_q1, target_q2) - self.log_alpha.exp() * next_log_prob
            target_q = reward + (1 - done) * self.gamma * target_q

        current_q1 = self.critic1(state, action)
        current_q2 = self.critic2(state, action)
        critic1_loss = F.mse_loss(current_q1, target_q)
        critic2_loss = F.mse_loss(current_q2, target_q)
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()
        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # Actor update: maximize min(Q1, Q2) - alpha * log_prob
        action_new, log_prob = self.actor.sample(state)
        q1_new = self.critic1(state, action_new)
        q2_new = self.critic2(state, action_new)
        q_new = torch.min(q1_new, q2_new)
        actor_loss = (self.log_alpha.exp() * log_prob - q_new).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Temperature update: push the policy entropy toward the target entropy
        alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        # Polyak-average the target critics toward the online critics
        for param, target_param in zip(self.critic1.parameters(), self.target_critic1.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.critic2.parameters(), self.target_critic2.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)


# A simple ring-buffer replay memory
class ReplayBuffer:
    def __init__(self, max_size=1e6):
        self.buffer = []
        self.max_size = int(max_size)
        self.ptr = 0

    def add(self, state, action, next_state, reward, done):
        # Grow until full, then overwrite the oldest entries in ring-buffer order
        if len(self.buffer) < self.max_size:
            self.buffer.append(None)
        self.buffer[self.ptr] = (state, action, next_state, reward, done)
        self.ptr = (self.ptr + 1) % self.max_size

    def sample(self, batch_size):
        indices = np.random.randint(0, len(self.buffer), batch_size)
        states, actions, next_states, rewards, dones = [], [], [], [], []
        for idx in indices:
            state, action, next_state, reward, done = self.buffer[idx]
            states.append(state)
            actions.append(action)
            next_states.append(next_state)
            rewards.append(reward)
            dones.append(float(done))  # store as float so it converts cleanly to a tensor
        return (np.array(states), np.array(actions), np.array(next_states),
                np.array(rewards), np.array(dones))


# Train SAC on Pendulum-v1
env = gym.make('Pendulum-v1')  # add render_mode="human" (requires pygame) to visualize
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

sac = SAC(state_dim, action_dim, max_action)
replay_buffer = ReplayBuffer()
max_episodes = 1000
batch_size = 256

for episode in range(max_episodes):
    state = env.reset()
    if isinstance(state, tuple):  # gym >= 0.26 returns (obs, info)
        state = state[0]
    episode_reward = 0
    done = False
    while not done:
        action = sac.select_action(state)
        step_result = env.step(action)
        if len(step_result) == 5:  # gym >= 0.26: (obs, reward, terminated, truncated, info)
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:  # older gym API: (obs, reward, done, info)
            next_state, reward, done, _ = step_result
        replay_buffer.add(state, action, next_state, reward, done)
        state = next_state
        episode_reward += reward
        if len(replay_buffer.buffer) > batch_size:
            sac.update(replay_buffer, batch_size)
    print(f"Episode {episode + 1}, Reward: {episode_reward}")

env.close()
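
After training, the stochastic policy is usually evaluated with the deterministic action tanh(mu) * max_action rather than a sample, which removes the exploration noise. The helper below is a minimal evaluation sketch, not part of the original listing; the evaluate function is a hypothetical addition that assumes the trained sac agent and the gym-version handling from the code above.

# Minimal evaluation sketch (hypothetical helper; assumes the trained `sac` from above)
def evaluate(sac, num_episodes=5):
    eval_env = gym.make('Pendulum-v1')
    returns = []
    for _ in range(num_episodes):
        state = eval_env.reset()
        if isinstance(state, tuple):  # gym >= 0.26 returns (obs, info)
            state = state[0]
        done, total = False, 0.0
        while not done:
            with torch.no_grad():
                # Deterministic action: tanh-squashed mean instead of a sample
                mu, _ = sac.actor(torch.FloatTensor(state.reshape(1, -1)))
                action = (torch.tanh(mu) * sac.actor.max_action).numpy().flatten()
            step_result = eval_env.step(action)
            if len(step_result) == 5:
                state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                state, reward, done, _ = step_result
            total += reward
        returns.append(total)
    eval_env.close()
    return float(np.mean(returns))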

Overview

Soft Actor-Critic (SAC) is a deep reinforcement learning algorithm built on the maximum-entropy framework and designed for continuous action spaces. It combines the actor-critic architecture with entropy regularization, striking a good balance between exploration and exploitation.
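
Concretely, instead of maximizing expected return alone, SAC maximizes return plus a weighted policy-entropy term. With temperature $\alpha$ (the quantity learned as log_alpha in the code above), the objective from Haarnoja et al. (2018) is

$$J(\pi) = \sum_{t} \mathbb{E}_{(s_t, a_t) \sim \rho_\pi}\left[ r(s_t, a_t) + \alpha \, \mathcal{H}\big(\pi(\cdot \mid s_t)\big) \right]$$

and the critic target computed in update() is the corresponding soft Bellman backup with the clipped double-Q trick:

$$y = r + \gamma (1 - d) \left[ \min_{i=1,2} Q_{\bar{\theta}_i}(s', a') - \alpha \log \pi(a' \mid s') \right], \qquad a' \sim \pi(\cdot \mid s').$$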

