Hello,
I was curious what benefits SAC might bring to my projects over PPO, so I set up a simple scene where the agent moves around a small level trying to fetch a target cube.
Agent script:
using UnityEngine;
using Unity.MLAgents;
using Unity.MLAgents.Actuators;
using Unity.MLAgents.Sensors;

public class AgentCube_1 : Agent
{
    public float agentRunSpeed = 3f;
    public float spawnAreaMarginMultiplier = 0.9f;
    public Transform Target;
    public GameObject ground;
    Bounds areaBounds;
    Rigidbody m_AgentRb;

    void Start()
    {
        m_AgentRb = GetComponent<Rigidbody>();
        areaBounds = ground.GetComponent<Collider>().bounds;
    }
    public override void OnEpisodeBegin()
    {
        // Move the target to a new spot
        Target.localPosition = GetRandomSpawnPos();
    }
    public override void OnActionReceived(ActionBuffers actionBuffers)
    {
        MoveAgent(actionBuffers);

        // Punish the agent if it is taking too long. AddReward (not SetReward)
        // so the penalty accumulates per step and cannot overwrite the
        // success reward set below once EndEpisode() has been called.
        AddReward(-5f / 1000f);

        // Reward the agent for reaching the target
        float distanceToTarget = Vector3.Distance(this.transform.localPosition, Target.localPosition);
        if (distanceToTarget < 1f)
        {
            SetReward(1.0f);
            EndEpisode();
        }
    }
    public override void CollectObservations(VectorSensor sensor)
    {
        // Target and agent positions (3 floats each, 6 observations total)
        sensor.AddObservation(Target.localPosition);
        sensor.AddObservation(this.transform.localPosition);
    }
    public void MoveAgent(ActionBuffers actionBuffers)
    {
        var dirToGo = Vector3.zero; // action 0: stand still
        var movementAction = actionBuffers.DiscreteActions[0];
        switch (movementAction)
        {
            case 1:
                dirToGo = transform.forward * 1f;   // forward
                break;
            case 2:
                dirToGo = transform.forward * -1f;  // backward
                break;
            case 3:
                dirToGo = transform.right * -0.75f; // strafe left
                break;
            case 4:
                dirToGo = transform.right * 0.75f;  // strafe right
                break;
        }
        m_AgentRb.AddForce(dirToGo * agentRunSpeed, ForceMode.VelocityChange);
    }
    public override void Heuristic(in ActionBuffers actionsOut)
    {
        // removed to reduce line amount
    }
    public Vector3 GetRandomSpawnPos()
    {
        var foundNewSpawnLocation = false;
        var randomSpawnPos = Vector3.zero;
        while (!foundNewSpawnLocation)
        {
            var randomPosX = Random.Range(-areaBounds.extents.x * spawnAreaMarginMultiplier,
                areaBounds.extents.x * spawnAreaMarginMultiplier);
            var randomPosZ = Random.Range(-areaBounds.extents.z * spawnAreaMarginMultiplier,
                areaBounds.extents.z * spawnAreaMarginMultiplier);
            randomSpawnPos = ground.transform.localPosition + new Vector3(randomPosX, 0.5f, randomPosZ);
            // Only accept the spot if nothing is already occupying it
            var worldSpacePos = transform.TransformPoint(randomSpawnPos);
            if (!Physics.CheckBox(worldSpacePos, new Vector3(0.5f, 0.1f, 0.5f)))
            {
                foundNewSpawnLocation = true;
            }
        }
        return randomSpawnPos;
    }
}
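For completeness, a Heuristic for this action space just maps keys onto the five discrete actions; a minimal sketch of that kind of mapping (the key bindings here are illustrative, not my actual ones):

public override void Heuristic(in ActionBuffers actionsOut)
{
    var discreteActionsOut = actionsOut.DiscreteActions;
    discreteActionsOut[0] = 0;                                   // default: stand still
    if (Input.GetKey(KeyCode.W)) discreteActionsOut[0] = 1;      // forward
    else if (Input.GetKey(KeyCode.S)) discreteActionsOut[0] = 2; // backward
    else if (Input.GetKey(KeyCode.A)) discreteActionsOut[0] = 3; // strafe left
    else if (Input.GetKey(KeyCode.D)) discreteActionsOut[0] = 4; // strafe right
}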
Hyperparameters:
behaviors:
  HiderAgent:
    trainer_type: sac
    hyperparameters:
      learning_rate: 0.0003
      learning_rate_schedule: constant
      batch_size: 512
      buffer_size: 51200
      buffer_init_steps: 0
      tau: 0.005
      steps_per_update: 16
      save_replay_buffer: false
      init_entcoef: 0.05
      reward_signal_steps_per_update: 16
    network_settings:
      normalize: false
      hidden_units: 128
      num_layers: 2
    reward_signals:
      extrinsic:
        gamma: 0.99
        strength: 1.0
    max_steps: 200000
    time_horizon: 64
    summary_freq: 10000
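For reference, I launch training with the standard trainer CLI; the file name and run id here are placeholders:

mlagents-learn config/sac_fetch.yaml --run-id=sac_fetch_01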
Agent max steps: 1000 (hitting the limit does not reset the agent's position; it only moves the target via OnEpisodeBegin, see the snippet after this list)
Agent space type: Discrete
Agents in scene: 16
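For clarity, the 1000-step limit is the Agent's built-in MaxStep field, which I set on the component in the inspector; setting it in code would look like this (illustrative):

void Awake()
{
    // Same effect as setting Max Step = 1000 on the Agent component in the inspector;
    // after 1000 steps the episode ends and OnEpisodeBegin() runs.
    MaxStep = 1000;
}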
While PPO works great, when I try SAC the agent picks a random direction and rams into the outer wall indefinitely, racking up reward penalties and never reaching the target.
What could be the issue here?