我在使用强化学习工具箱编写SAC智能体进行训练时,策略输出一直在动作上下限之间波动,没有进行有效的探索;而使用DDPG智能体和PPO智能体时则能够进行一些有效的探索,请问这是什么原因?

14 views (last 30 days)
希
on 31 Aug 2024
Answered: halleyhit on 2 Sep 2024
%main
% Train and simulate a SAC agent (Reinforcement Learning Toolbox) on the
% custom function environment implemented by sacStepFunction and
% sacResetFunction.

%% Observation and action specifications
numObs1 = 7;   % observation dimension
numAct1 = 3;   % action dimension, ordered [BS; EB; CL]
actLowerLimit = [-100; -200; -50];
actUpperLimit = [ 100;  200;  50];

obsInfo = rlNumericSpec([numObs1 1]);
obsInfo.Name = 'ObservationSac1';

% Continuous, bounded action space
actInfo = rlNumericSpec([numAct1 1], ...
    'LowerLimit',actLowerLimit, ...
    'UpperLimit',actUpperLimit);
actInfo.Name = 'ActionSac';

%% Environment
env = rlFunctionEnv(obsInfo, actInfo, 'sacStepFunction', 'sacResetFunction');

%% Network sizes
criticLayerSizes = [64 32];
actorLayerSizes  = [64 32];

%% Critic network: Q(observation, action) -> scalar
% Observation branch
obsPath = [
    featureInputLayer(numObs1, Name="obsPathInLyr")
    fullyConnectedLayer(criticLayerSizes(1))
    reluLayer
    fullyConnectedLayer(criticLayerSizes(1), Name="obsout")
    ];
% Action branch
actPath = [
    featureInputLayer(numAct1, Name="actPathInLyr")
    fullyConnectedLayer(criticLayerSizes(1))
    reluLayer
    fullyConnectedLayer(criticLayerSizes(1), Name="actout")
    ];
% Merged trunk
comPath = [
    concatenationLayer(1,2,Name="cct")
    fullyConnectedLayer(criticLayerSizes(2))
    reluLayer
    fullyConnectedLayer(1, Name="output")
    ];

criticNetwork = dlnetwork();
criticNetwork = addLayers(criticNetwork,obsPath);
criticNetwork = addLayers(criticNetwork,actPath);
criticNetwork = addLayers(criticNetwork,comPath);
criticNetwork = connectLayers(criticNetwork,"obsout","cct/in1");
criticNetwork = connectLayers(criticNetwork,"actout","cct/in2");

% SAC uses two critics that must start from different parameters.
% Initialize the shared architecture twice so that each critic receives
% its own independently drawn random weights.
critic11 = rlQValueFunction(initialize(criticNetwork),obsInfo,actInfo, ...
    ActionInputNames="actPathInLyr", ...
    ObservationInputNames="obsPathInLyr");
critic12 = rlQValueFunction(initialize(criticNetwork),obsInfo,actInfo, ...
    ActionInputNames="actPathInLyr", ...
    ObservationInputNames="obsPathInLyr");

%% Actor network: observation -> (action mean, action standard deviation)
% Shared trunk. The first learnable layer acts on the raw observation and
% the trunk ends in a nonlinearity, so both heads see nonlinear features.
% (Previously a ReLU was applied directly to the input and the mean head
% branched off a linear layer.)
inPath = [
    featureInputLayer(numObs1, Name="netOin")
    fullyConnectedLayer(actorLayerSizes(1), Name="nethid")
    reluLayer
    fullyConnectedLayer(actorLayerSizes(2), Name="infc")
    reluLayer(Name="commonRelu")
    ];
% Mean head: left unbounded on purpose. The SAC agent reads the limits in
% actInfo and bounds/scales the sampled action internally, so no
% tanh/scaling layer is added here.
meanPath = fullyConnectedLayer(numAct1, Name="FCMean");
% Standard-deviation head: softplus keeps the output non-negative.
sdevPath = [
    fullyConnectedLayer(numAct1, Name="FCStdv")
    softplusLayer(Name="splus")
    ];

actorNetwork = dlnetwork();
actorNetwork = addLayers(actorNetwork,inPath);
actorNetwork = addLayers(actorNetwork,meanPath);
actorNetwork = addLayers(actorNetwork,sdevPath);
actorNetwork = connectLayers(actorNetwork,"commonRelu","FCMean/in");
actorNetwork = connectLayers(actorNetwork,"commonRelu","FCStdv/in");

% Stochastic (Gaussian) policy
actor1 = rlContinuousGaussianActor(actorNetwork, obsInfo, actInfo, ...
    ActionMeanOutputNames="FCMean", ...
    ActionStandardDeviationOutputNames="splus", ...
    ObservationInputNames="netOin");

% Sanity check: the actor must accept a random observation.
act  = getAction(actor1,{rand(obsInfo.Dimension)});
dist = evaluate(actor1,{rand(obsInfo.Dimension)});

%% Optimizer settings
criticOptions = rlOptimizerOptions( ...
    Optimizer="adam", ...
    LearnRate=1e-3, ...
    GradientThreshold=1, ...
    L2RegularizationFactor=2e-4);
actorOptions = rlOptimizerOptions( ...
    Optimizer="adam", ...
    LearnRate=1e-3, ...
    GradientThreshold=1, ...
    L2RegularizationFactor=1e-5);

%% SAC agent
% NOTE(review): the observation mixes a time index that runs up to 96
% with prices below 2 and is fed to the networks unscaled; large inputs
% can push the mean head into the saturated region of the agent's
% internal action squashing. Consider normalizing observations - confirm
% against the actual data ranges.
sacOptions = rlSACAgentOptions( ...
    'TargetSmoothFactor',1e-3, ...       % target-critic smoothing factor
    'ExperienceBufferLength',5000, ...   % replay buffer size
    'MiniBatchSize',256, ...             % mini-batch size
    'DiscountFactor',0.99, ...           % discount factor
    'SampleTime',1, ...                  % sample time
    'CriticOptimizerOptions',criticOptions, ...
    'ActorOptimizerOptions',actorOptions);

agent1 = rlSACAgent(actor1,[critic11 critic12],sacOptions);

%% Training
trainOpts = rlTrainingOptions( ...
    'MaxEpisodes',500, ...                       % maximum number of episodes
    'MaxStepsPerEpisode',96, ...                 % maximum steps per episode
    'Verbose',true, ...                          % print progress to the command window
    'Plots','training-progress', ...             % show the training progress plot
    'StopTrainingCriteria','AverageReward', ...  % stopping criterion
    'StopTrainingValue',0, ...                   % average reward that stops training
    'ScoreAveragingWindowLength',10, ...         % averaging window (episodes)
    'SaveAgentCriteria',"EpisodeReward", ...     % criterion for saving agents
    'SaveAgentValue',0);                         % episode reward that triggers a save

result = train(agent1,env,trainOpts);

%% Test
% train updates agent1 in place, so simulate with it directly. The name
% agent_Trained used previously is never defined in this script.
agent = agent1;
simSteps = 200;
simOptions = rlSimulationOptions('MaxSteps',simSteps);
experience = sim(env,agent,simOptions);
simActionSeries = experience.Action.ActionSac.Data;
%STEPFUNCTION
function [NextObs,Reward,IsDone,LoggedSignals] = sacStepFunction(Action,LoggedSignals)
%SACSTEPFUNCTION Advance the single-agent dispatch environment by one time slot.
%
%   Inputs
%     Action         3-element action, ordered [BS; EB; CL].
%     LoggedSignals  Struct whose field State holds the current 7-by-1
%                    state [T; load; price; PV; CHP; EB; delay].
%
%   Outputs
%     NextObs        1-by-1 cell containing the 7-by-1 next state.
%     Reward         Negative operating cost of the current slot.
%     IsDone         True once the next time index reaches 96.
%     LoggedSignals  Input struct with State replaced by the next state
%                    and field action set to Action.
%
%   The time series come from load_e.mat, PV.mat and PSO_data.mat. They
%   are read once and cached in a persistent variable instead of being
%   re-read on every step; run "clear sacStepFunction" after editing the
%   MAT-files to force a reload.

persistent profiles
if isempty(profiles)
    profiles = sacStepLoadProfiles();
end

State = LoggedSignals.State;

% Debug output
disp(Action);

% Unpack the current state
T_solt    = State(1,1);
LOAD_E    = State(2,1);
Power_buy = State(3,1);
PV_now    = State(4,1);
CHP       = State(5,1);
EB        = State(6,1);

% Exogenous series values at the next slot form the next state.
T_next  = T_solt + 1;
NextObs = [T_next; ...
           profiles.load(T_next); ...
           profiles.price(T_next); ...
           profiles.pv(T_next); ...
           profiles.chp(T_next); ...
           profiles.eb(T_next); ...
           profiles.delay(T_next)];

% Locally supplied power for the current slot
LOAD_real = PV_now + 0.9 * CHP - (EB/0.95 - Action(2)) + Action(3) + Action(1);
% Remainder covered by purchase. It is not clamped, so it can be negative.
BUY = LOAD_E - LOAD_real;
% Operating cost: per-action regulation cost plus purchase cost.
% NOTE(review): BUY is linear in every action and the regulation terms
% are absolute values, so COST is piecewise linear in each action and its
% minimiser lies at an action bound or at zero. A policy that sits on the
% limits is therefore consistent with this reward. The tiered
% imbalance-penalty terms that existed here were disabled.
COST = 0.5 * abs(Action(1)) + 0.2 * abs(Action(2)) + Power_buy * BUY + 0.5 * abs(Action(3));
Reward = -COST;

LoggedSignals.State  = NextObs;
LoggedSignals.action = Action;
NextObs = {NextObs};

% The episode ends when the next slot index reaches 96.
IsDone = (T_next >= 96);
end

function profiles = sacStepLoadProfiles()
% Read the agent-1 time series used by sacStepFunction from the MAT-files.
loadData   = load('load_e.mat');
scaledLoad = 0.6 * transpose(loadData.load_e);
profiles.load = scaledLoad(1:96);

% Hourly time-of-use purchase price, expanded to 96 quarter-hour slots
hourlyPrice = [0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.77 0.77 1.19 1.19 ...
               1.19 1.19 1.19 1.19 1.19 1.19 1.19 0.77 0.77 0.77 0.77 0.77];
profiles.price = repelem(hourlyPrice, 4);

% PV output: first column of PV
pvData = load('PV.mat');
profiles.pv = transpose(pvData.PV(:,1));

% CHP and EB schedules: columns 1 and 4 of PSO_data
psoData = load('PSO_data.mat');
profiles.chp = psoData.PSO_data(:,1);
profiles.eb  = psoData.PSO_data(:,4);

% Command delay is a constant 2 for every slot
profiles.delay = 2 * ones(96,1);
end
%RESET
%状态环境重置函数
function [InitialObservation, LoggedSignal] = sacResetFunction()
%SACRESETFUNCTION Reset the single-agent dispatch environment to slot 1.
%
%   Outputs
%     InitialObservation  1-by-1 cell containing the 7-by-1 initial state
%                         [T; load; price; PV; CHP; EB; delay].
%     LoggedSignal        Struct whose field State holds the same vector.
%
%   The first sample of each series is read from load_e.mat, PV.mat and
%   PSO_data.mat. The files are loaded into structs so that no variables
%   are created dynamically in the function workspace.

% Electrical load, scaled by 0.6
loadData   = load('load_e.mat');
scaledLoad = 0.6 * transpose(loadData.load_e);

% Hourly time-of-use purchase price; the first slot uses the first hour.
hourlyPrice = [0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.77 0.77 1.19 1.19 ...
               1.19 1.19 1.19 1.19 1.19 1.19 1.19 0.77 0.77 0.77 0.77 0.77];

% PV output (column 1) and the CHP / EB schedules (columns 1 and 4)
pvData  = load('PV.mat');
psoData = load('PSO_data.mat');

T_solt     = 1;   % episodes start at the first time slot
delayFirst = 2;   % command delay is a constant 2 for every slot

LoggedSignal.State = [T_solt; ...
                      scaledLoad(1); ...
                      hourlyPrice(1); ...
                      pvData.PV(1,1); ...
                      psoData.PSO_data(1,1); ...
                      psoData.PSO_data(1,4); ...
                      delayFirst];

% The initial state is also returned as the initial observation.
InitialObservation = {LoggedSignal.State(:,1)};
end

Answers (1)

halleyhit
halleyhit on 2 Sep 2024
不同的智能体就是适应不同问题的,如果一个智能体能应对所有问题,那么就不会有其他智能体了

Products


Release

R2024a

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!