When I use the Reinforcement Learning Toolbox to train a SAC agent, the policy keeps bouncing between the action upper and lower limits and never explores effectively, whereas DDPG and PPO agents do manage some useful exploration. What could be causing this?
%main
% Define the observation and action spaces
% numObs = 11; % observation dimension
% numAct = 4;  % action dimension
numObs1 = 7; % observation dimension
numAct1 = 3; % action dimension
% Action channels: BS, EB, CL
actLowerLimit = [-100 ;-200 ; -50];
actUpperLimit = [100 ; 200 ; 50];
obsInfo = rlNumericSpec([numObs1 1]);
obsInfo.Name = 'ObservationSac1';
% Continuous action space
actInfo = rlNumericSpec([numAct1 1],...
'LowerLimit',actLowerLimit,...
'UpperLimit',actUpperLimit);
actInfo.Name = 'ActionSac';
% Create the reinforcement learning environment from the custom step and reset functions
env = rlFunctionEnv(obsInfo, actInfo, 'sacStepFunction', 'sacResetFunction');
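% Optional sanity check (a minimal sketch, not part of the original script):
% validateEnvironment runs the reset and step functions once and checks that
% the returned observations match obsInfo, which catches dimension or
% cell-wrapping mistakes before training starts.
validateEnvironment(env)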
% Network size parameters
criticLayerSizes = [64 32];
actorLayerSizes = [64 32];
% Critic networks: a SAC critic takes both the observation and the action as
% inputs and outputs a single Q-value, so each critic network has two input paths.
% Observation input path
obsPath = [
featureInputLayer(numObs1, Name="obsPathInLyr")
fullyConnectedLayer(criticLayerSizes(1))
reluLayer
fullyConnectedLayer(criticLayerSizes(1),Name="obsout")
];
% Action input path
actPath = [
featureInputLayer(numAct1, Name="actPathInLyr")
fullyConnectedLayer(criticLayerSizes(1))
reluLayer
fullyConnectedLayer(criticLayerSizes(1),Name="actout")
];
% Merged path
comPath = [
concatenationLayer(1,2,Name="cct")
fullyConnectedLayer(criticLayerSizes(2))
reluLayer
fullyConnectedLayer(1, Name="output")
];
% Assemble the critic network and create the two Q-value critics
criticNetwork = dlnetwork();
criticNetwork = addLayers(criticNetwork,obsPath);
criticNetwork = addLayers(criticNetwork,actPath);
criticNetwork = addLayers(criticNetwork,comPath);
criticNetwork = connectLayers(criticNetwork,"obsout","cct/in1");
criticNetwork = connectLayers(criticNetwork,"actout","cct/in2");
critic11 = rlQValueFunction(criticNetwork,obsInfo,actInfo, ...
ActionInputNames="actPathInLyr", ...
ObservationInputNames="obsPathInLyr");
critic12 = rlQValueFunction(criticNetwork,obsInfo,actInfo, ...
ActionInputNames="actPathInLyr", ...
ObservationInputNames="obsPathInLyr");
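% Optional check (a sketch, not required for training): getValue evaluates a
% Q-value critic for a sample observation/action pair, which is a quick way to
% confirm that the two input paths are wired as intended.
q = getValue(critic11, {rand(obsInfo.Dimension)}, {rand(actInfo.Dimension)});
disp(q)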
% Actor network: the SAC actor takes the observation as input and outputs the
% mean and standard deviation of the action distribution (one input, two outputs).
% Common input path
inPath = [
featureInputLayer( ...
numObs1, ...
Name="netOin")
reluLayer
fullyConnectedLayer( ...
actorLayerSizes(1), ...
Name="nethid")
reluLayer
fullyConnectedLayer( ...
actorLayerSizes(2), ...
Name="infc")
];
meanPath = [
% tanhLayer(Name="tanhMean");
fullyConnectedLayer(numAct1,Name="FCMean");
% scalingLayer(Name="scale", ...
% Scale=actUpperLimit),
];
sdevPath = [
reluLayer(Name="reluStdv");
fullyConnectedLayer(numAct1,Name="FCStdv");
softplusLayer(Name="splus")
];
actorNetwork = dlnetwork();
actorNetwork = addLayers(actorNetwork,inPath);
actorNetwork = addLayers(actorNetwork,meanPath);
actorNetwork = addLayers(actorNetwork,sdevPath);
% actorNetwork = connectLayers(actorNetwork,"infc","tanhMean/in");
% actorNetwork = connectLayers(actorNetwork,"infc","tanhStdv/in");
actorNetwork = connectLayers(actorNetwork,"infc","FCMean/in");
actorNetwork = connectLayers(actorNetwork,"infc","reluStdv/in");
% Create the stochastic (Gaussian-sampling) actor
actor1 = rlContinuousGaussianActor(actorNetwork, obsInfo, actInfo, ...
ActionMeanOutputNames="FCMean",...
ActionStandardDeviationOutputNames="splus",...
ObservationInputNames="netOin");
act = getAction(actor1,{rand(obsInfo.Dimension)});   % sample an action for a random observation
dist = evaluate(actor1,{rand(obsInfo.Dimension)});   % raw network outputs: action mean and standard deviation
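% A minimal diagnostic sketch (assumption: with the output wiring above, the
% first evaluate output is the mean path "FCMean" and the second is the
% softplus standard-deviation path "splus"). Printing the standard deviation
% and comparing it with the [-100,100]/[-200,200]/[-50,50] action ranges gives
% a quick picture of how much exploration noise the untrained policy injects.
actMean = dist{1};
actStd  = dist{2};
disp("Initial action mean:");               disp(actMean)
disp("Initial action standard deviation:"); disp(actStd)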
% Critic optimizer settings
criticOptions = rlOptimizerOptions( ...
Optimizer="adam", ...
LearnRate=1e-3,...
GradientThreshold=1, ...
L2RegularizationFactor=2e-4);
% Actor optimizer settings
actorOptions = rlOptimizerOptions( ...
Optimizer="adam", ...
LearnRate=1e-3,...
GradientThreshold=1, ...
L2RegularizationFactor=1e-5);
% SAC agent options
sacOptions = rlSACAgentOptions(...
'TargetSmoothFactor',1e-3,... % target-network smoothing factor
'ExperienceBufferLength',5000,... % replay buffer length
'MiniBatchSize',256,... % mini-batch size
'DiscountFactor',0.99,... % discount factor
'SampleTime',1,... % sample time
'CriticOptimizerOptions', criticOptions,...
'ActorOptimizerOptions',actorOptions);
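% Note (a hedged pointer, not part of the original setup): SAC drives
% exploration through its entropy term rather than through added noise. The
% options above leave the entropy settings at their defaults; if the installed
% toolbox release exposes them, they live under
% sacOptions.EntropyWeightOptions (e.g. EntropyWeight, TargetEntropy, LearnRate)
% and are worth inspecting when the policy collapses to the action bounds:
% disp(sacOptions.EntropyWeightOptions)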
% Create the SAC agent with one actor and two critics
agent1 = rlSACAgent(actor1,[critic11 critic12],sacOptions);
% Training options
trainOpts = rlTrainingOptions(...
'MaxEpisodes',500,... % maximum number of training episodes
'MaxStepsPerEpisode',96,... % maximum steps per episode
'Verbose',true,... % print detailed training information
'Plots','training-progress',... % show the training progress plot
'StopTrainingCriteria','AverageReward',... % stop-training criterion
'StopTrainingValue',0,... % average reward at which to stop training
'ScoreAveragingWindowLength',10,... % window length for the average reward
'SaveAgentCriteria',"EpisodeReward",... % agent-saving criterion
'SaveAgentValue',0); % episode reward at which to save the agent
% % Single-agent training options (alternative, unused)
% trainOpts = rlTrainingOptions(...
% Plots='training-progress',...
% MaxEpisodes=500,...
% MaxStepsPerEpisode=96,...
% ScoreAveragingWindowLength=10,...
% StopTrainingCriteria="AverageReward", ...
% StopTrainingValue=0);
% %"LearningStrategy","decentralized",...
% % 'Verbose',true, ...
% Train the agent
result = train(agent1,env,trainOpts);
%% Test
agent = agent_Trained;   % trained agent (e.g., loaded from a saved-agent file); alternatively, use agent1 trained above
%agent = agent1;
simSteps = 200;
simOptions = rlSimulationOptions('MaxSteps',simSteps);
experience = sim(env,agent,simOptions);
simActionSeries = experience.Action.ActionSac.Data;
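% A minimal plotting sketch: viewing the simulated actions against the action
% limits makes any saturation at the bounds easy to spot. (Assumes
% simActionSeries is numAct1-by-1-by-numSteps, as returned by sim.)
actTraj = squeeze(simActionSeries);          % numAct1-by-numSteps
figure; plot(actTraj'); hold on
for k = 1:numel(actUpperLimit)               % overlay the action bounds
    yline(actUpperLimit(k),'--');
    yline(actLowerLimit(k),'--');
end
xlabel("step"); ylabel("action");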
%STEPFUNCTION
function [NextObs,Reward,IsDone,LoggedSignals] = sacStepFunction(Action,LoggedSignals)
Q_BS=500;
% Daily electrical load demand of the three integrated energy systems
load('load_e.mat');
LOAD_EE = 0.6*transpose(load_e);
LOAD_EE1 = LOAD_EE(1:96);
LOAD_EE2 = LOAD_EE(97:192);
LOAD_EE3 = LOAD_EE(193:288);
% Time-of-use electricity purchase price, expanded to 96 points
Power_B1=[0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.77 0.77 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 0.77 0.77 0.77 0.77 0.77];
Power_Buy=zeros(1,96);
for n=1:24
Power_Buy(4*n-3:4*n)=Power_B1(n);
end
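% Equivalent vectorized form of the loop above (a side note): repelem repeats
% each hourly price four times to build the 96-point quarter-hour series.
% Power_Buy = repelem(Power_B1, 4);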
% PV output, 15-minute resolution
load('PV.mat');
PV1_96 = transpose(PV(:,1));
PV2_96 = transpose(PV(:,2));
PV3_96 = transpose(PV(:,3));
%% EB and CHP
load('PSO_data.mat')
CHP1 = PSO_data(:,1);
CHP2 = PSO_data(:,2);
CHP3 = PSO_data(:,3);
EB1 = PSO_data(:,4);
EB2 = PSO_data(:,5);
EB3 = PSO_data(:,6);
% Command delay (s), 96*3
delay11 = 2*ones(96,1);
delay12 = 2*ones(96,1);
delay13 = 2*ones(96,1);
% BS, EB, BUY, CL, TR
%action_space = rlNumericSpec([5 1], 'LowerLimit', action_lowerlimits, 'UpperLimit', action_upperlimits);
% Current state: T_solt; LOAD_E1; LOAD_E2; LOAD_E3; Power_buy; PV_1; PV_2; PV_3; delay1; delay2; delay3
State = LoggedSignals.State;
% Debug info
% disp(class(Action));
% disp(size(Action));
disp(Action);
% Update the state agent by agent
NextObs = zeros(size(State));
Reward = 0;
for agent_i = 1:1
T_solt = State(1,agent_i);
LOAD_E = State(2,agent_i);
Power_buy = State(3,agent_i);
PV = State(4,agent_i);
CHP = State(5,agent_i);
EB = State(6,agent_i);
delay = State(7,agent_i);
T_next = T_solt + 1;
LOAD_E_next = LOAD_EE1(T_next);
Power_buy_next = Power_Buy(T_next);
PV_next = PV1_96(T_next);
CHP_next = CHP1(T_next);
EB_next = EB1(T_next);
delay_next = delay11(T_next);
% Update the state
NextObs(:, agent_i) = [T_next; LOAD_E_next; Power_buy_next; PV_next; CHP_next; EB_next; delay_next];
% Reward
% Dispatch cost; action channels are [BS EB CL]
LOAD_real = PV(1) + 0.9*CHP - (EB/0.95 - Action(2)) + Action(3) + Action(1);   % power supplied locally
% Balance term: power purchased from the grid
BUY = LOAD_E(1) - LOAD_real;
COST = 0.5*abs(Action(1)) + 0.2*abs(Action(2)) + Power_buy*BUY + 0.5*abs(Action(3));
% Delay-deviation cost (disabled)
% if BUY<=500 && BUY>0
% Penalty_local = 0.5*BUY;
% elseif BUY>500 && BUY<=1000
% Penalty_local = 2*BUY;
% else
% Penalty_local = 5*BUY;
% end
%Penalty_local = LOAD_E(1) - PV(1) + Action(1) + Action(2) - Action(3) - Action(4) - 0.8 * CHP + EB / 0.95; % positive means net consumption
% if abs(Penalty_local)<=80
% Penalty_local = 1*abs(Penalty_local);
% elseif abs(Penalty_local)<=120
% Penalty_local = 2*abs(Penalty_local);
% elseif abs(Penalty_local)>120
% Penalty_local = 3*abs(Penalty_local);
% else
% Penalty_local = 0;
% end
%Penalty_local
% % Global power-balance constraint (disabled)
% Penalty_global = 100*(sum(LOAD_E) -sum(PV) + sum(Action(1)) + sum(Action(2)) - sum(Action(3)) - sum(Action(4)) - sum(Action(5))) ;
% if Penalty_global>300 || Penalty_local<-300
% Penalty_global=100;
% else
% Penalty_global = 0;
% end
% Power-exchange constraint (disabled)
% Penalty_Pt = 100*sum(Action(5,:));
% Reward = -COST- Penalty_local;
Reward = -COST;
end
LoggedSignals.State = NextObs;
LoggedSignals.action=Action;
NextObs = mat2cell(NextObs, 7, 1);
% Check whether the episode has finished
%T_next
IsDone=(T_next >= 96);
end
% RESET
% Environment reset function
function [InitialObservation, LoggedSignal] = sacResetFunction()  % reset the RL environment
% Daily electrical load demand of the three integrated energy systems
load('load_e.mat');
LOAD_E = 0.6*transpose(load_e);
LOAD_E1 = LOAD_E(1:96);
LOAD_E2 = LOAD_E(97:192);
LOAD_E3 = LOAD_E(193:288);
% Time-of-use electricity purchase price, expanded to 96 points
Power_B=[0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.77 0.77 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 0.77 0.77 0.77 0.77 0.77];
Power_buy=zeros(1,96);
for n=1:24
Power_buy(4*n-3:4*n)=Power_B(n);
end
% PV output, 15-minute resolution
load('PV.mat');
PV_1 = transpose(PV(:,1));
PV_2 = transpose(PV(:,2));
PV_3 = transpose(PV(:,3));
%% Command delay (s), 96*3
delay1 = 2*ones(96,1);
delay2 = 2*ones(96,1);
delay3 = 2*ones(96,1);
%% EB and CHP
load('PSO_data.mat')
CHP1 = PSO_data(:,1);
CHP2 = PSO_data(:,2);
CHP3 = PSO_data(:,3);
EB1 = PSO_data(:,4);
EB2 = PSO_data(:,5);
EB3 = PSO_data(:,6);
% Initialization: take the first time slot of each series
T_solt = 1;
LOAD_E1 = LOAD_E1(1);
LOAD_E2 = LOAD_E2(1);
LOAD_E3 = LOAD_E3(1);
Power_buy = Power_buy(1);
PV_1 = PV_1(1);
PV_2 = PV_2(1);
PV_3 = PV_3(1);
delay1 = delay1(1);
delay2 = delay2(1);
delay3 = delay3(1);
CHP1 = CHP1(1);
CHP2 = CHP2(1);
CHP3 = CHP3(1);
EB1=EB1(1);
EB2=EB2(1);
EB3=EB3(1);
% Initial observation of the agent state (only the first of the three agents is used here)
LoggedSignal.State(:,1) = [T_solt;LOAD_E1;Power_buy;PV_1;CHP1;EB1;delay1;];
% LoggedSignal.State(:,2) = [T_solt;LOAD_E1;LOAD_E2;LOAD_E3;Power_buy;PV_1;PV_2;PV_3;delay1;delay2;delay3];
% LoggedSignal.State(:,3) = [T_solt;LOAD_E1;LOAD_E2;LOAD_E3;Power_buy;PV_1;PV_2;PV_3;delay1;delay2;delay3];
% LoggedSignal.State=[T_solt;LOAD_E1;LOAD_E2;LOAD_E3;Power_buy;PV_1;PV_2;PV_3;delay1;delay2;delay3];
% Return the initial environment state in the logged signal LoggedSignal
% InitialObservation = {LoggedSignal.Agent1State, LoggedSignal.Agent2State, LoggedSignal.Agent3State};
InitialObservation = {LoggedSignal.State(:,1)};
end