Unity ML-Agents OnEpisodeBegin() error when multiple agents are utilised

I have extended the GridWorld example for my game. The original environment contained a single agent; for my implementation, I am trying to include multiple agents that move around the environment committing crimes.

Normally, when the OnEpisodeBegin() method is called, area.AreaReset() is invoked to reset the environment during training. However, now that I have included several agents, OnEpisodeBegin() is called multiple times: the first agent that is initialised works, but the other agents remain stationary and the following error pops up:

MissingReferenceException: The object of type 'GameObject' has been destroyed but you are still trying to access it. Your script should either check if it is null or you should not destroy the object. AgentBehaviourScript.OnActionReceived (Unity.MLAgents.Actuators.ActionBuffers actionBuffers) (at Assets/AgentBehaviourScript.cs:125)

There is clearly an order-of-execution issue: one agent calls area.AreaReset() before the other agents have been spawned at their new locations.

The model runs fine when area.AreaReset() isn't called in OnEpisodeBegin(), but then I can't train my environment, so any help would be greatly appreciated.
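As the exception message suggests, one stopgap I could use is to check whether the cached node has been destroyed before dereferencing it at the top of OnActionReceived() (a minimal sketch, using the routine_activity_nodes_list and current_nav_node fields shown further down), although this only masks the reset-ordering problem rather than fixing it:

// Stopgap sketch only, not a fix: bail out of OnActionReceived() early when the
// cached node has already been destroyed by another agent's AreaReset().
var selected_node = routine_activity_nodes_list[current_nav_node];
if (selected_node == null)   // Unity's overloaded == also reports destroyed objects as null
{
    return;                  // skip this step; the list is rebuilt in OnEpisodeBegin()
}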

Here are both of my code snippets for GridArea.cs and AgentBehaviourScript.cs

GridArea.cs:

public void Start()
{
    m_ResetParams = Academy.Instance.EnvironmentParameters;

    m_Objects = new[] { rewardingBuilding, interventionObject, navigationNodes };

    actorObjs = new List<GameObject>();

    var sceneTransform = transform.Find("scene");

    m_Plane = sceneTransform.Find("Plane").gameObject;
    m_Sn = sceneTransform.Find("sN").gameObject;
    m_Ss = sceneTransform.Find("sS").gameObject;
    m_Sw = sceneTransform.Find("sW").gameObject;
    m_Se = sceneTransform.Find("sE").gameObject;
    m_ZoneA = sceneTransform.Find("ZoneA").gameObject;
    m_ZoneB = sceneTransform.Find("ZoneB").gameObject;
    m_ZoneC = sceneTransform.Find("ZoneC").gameObject;
    m_ZoneD = sceneTransform.Find("ZoneD").gameObject;

    interventions = new List<GameObject>();
    targets = new List<GameObject>();
    agents = GameObject.FindGameObjectsWithTag("agent");


    AreaReset();

    //m_InitialPosition = transform.position;
}

void SetEnvironment()
{
    // We should have playerList.Add(1) to 4 for secured buildings and 5 to 8 for rewarding buildings for each zone.

    //transform.position = m_InitialPosition * (m_ResetParams.GetWithDefault("gridSize", gridsize) + 1);
    var playersListA = new List<int>();
    var playersListB = new List<int>();
    var playersListC = new List<int>();
    var playersListD = new List<int>();
    var navigationList = new List<int>();

    // TICK.
    objectsZoneA = add_objects_to_array("A", playersListA, numberOfInterventionsZoneA, numberOfRewardBuildingsZoneA).ToArray();
    objectsZoneB = add_objects_to_array("B", playersListB, numberOfInterventionsZoneB, numberOfRewardBuildingsZoneB).ToArray();
    objectsZoneC = add_objects_to_array("C", playersListC, numberOfInterventionsZoneC, numberOfRewardBuildingsZoneC).ToArray();
    objectsZoneD = add_objects_to_array("D", playersListD, numberOfInterventionsZoneD, numberOfRewardBuildingsZoneD).ToArray();

    navigationNodesList = add_navigation_to_array(navigationList, numberOfNavigationNodes).ToArray();

    set_up_env();

}

public void spawn_agent(GameObject agent){
  // Select a random index from the list of game objects.
  var random_nav_node = UnityEngine.Random.Range(0, nav_nodes.Length);
  // xA and yA are the local positions for x and z axis of the navigation node.
  var xA = nav_nodes[random_nav_node].transform.localPosition.x;
  var yA = nav_nodes[random_nav_node].transform.localPosition.z;

  // We initialise the offender agent at the same location as the navigation node.
  agent.transform.localPosition = new Vector3(xA, 0.34f, yA);
}


public void AreaReset()
{

    var gridSize = (int)m_ResetParams.GetWithDefault("gridSize", gridsize);


    emptyCells = new Dictionary<(int, int), bool>();

    foreach (var actor in actorObjs)
    {
        Destroy(actor);
    }

    SetEnvironment();

    interventions.Clear();
    targets.Clear();
    actorObjs.Clear();

    var numbersA = new HashSet<int>();
    var numbersB = new HashSet<int>();
    var numbersC = new HashSet<int>();
    var numbersD = new HashSet<int>();
    var navigationNodesHash = new HashSet<int>();


    while (navigationNodesHash.Count < navigationNodesList.Length + 1){

      navigationNodesHash.Add(UnityEngine.Random.Range(0, gridSize * gridSize));

    }

    while (numbersA.Count < objectsZoneA.Length + 1)
    {
        numbersA.Add(UnityEngine.Random.Range(0, gridSize * gridSize));
    }


    while (numbersB.Count < objectsZoneB.Length + 1)
    {
        numbersB.Add(UnityEngine.Random.Range(0, gridSize * gridSize));
    }

    while (numbersC.Count < objectsZoneC.Length + 1)
    {
        numbersC.Add(UnityEngine.Random.Range(0, gridSize * gridSize));
    }

    while (numbersD.Count < objectsZoneD.Length + 1)
    {
        numbersD.Add(UnityEngine.Random.Range(0, gridSize * gridSize));
    }

    var numbersA_ = numbersA.ToArray();
    var numbersB_ = numbersB.ToArray();
    var numbersC_ = numbersC.ToArray();
    var numbersD_ = numbersD.ToArray();

    object_location("A");
    object_location("B");
    object_location("C");
    object_location("D");
    object_location("navigation");

    agents = GameObject.FindGameObjectsWithTag("agent");
    nav_nodes = GameObject.FindGameObjectsWithTag("navigation_node");

    foreach(GameObject agent in agents){
      spawn_agent(agent);

    }
}
public List<int> add_objects_to_array(string zone, List<int> objectList, int secured, int rewarding){

    for (var i = 0; i < (int)m_ResetParams.GetWithDefault("numberOfInterventionsZone{zone}", secured); i++)
    {
        objectList.Add(1);
    }

    for (var i = 0; i < (int)m_ResetParams.GetWithDefault("numRewardingBuildings{zone}", rewarding); i++)
    {
        objectList.Add(0);
    }
    return objectList;
  }


  public List<int> add_navigation_to_array(List<int> objectList, int number_of_nodes){

    for (var i = 0; i < (int)m_ResetParams.GetWithDefault("numOfNavNodes", number_of_nodes); i++)
    {
        objectList.Add(2);
    }

    return objectList;
  }


  public void set_up_env(){
    var gridSize = (int)m_ResetParams.GetWithDefault("gridSize", gridsize);
    m_Plane.transform.localScale = new Vector3(gridSize / 10.0f, 1f, gridSize / 10.0f);
    m_Plane.transform.localPosition = new Vector3((gridSize - 1) / 2f, -0.5f, (gridSize - 1) / 2f);

    m_Sn.transform.localScale = new Vector3(1, 3, gridSize + 2);
    m_Ss.transform.localScale = new Vector3(1, 3, gridSize + 2);
    m_Sn.transform.localPosition = new Vector3((gridSize - 1) / 2f, 1f, gridSize);
    m_Ss.transform.localPosition = new Vector3((gridSize - 1) / 2f, 1f, -1);
    m_Se.transform.localScale = new Vector3(1, 3, gridSize + 2);
    m_Sw.transform.localScale = new Vector3(1, 3, gridSize + 2);
    m_Se.transform.localPosition = new Vector3(gridSize, 1f, (gridSize - 1) / 2f);
    m_Sw.transform.localPosition = new Vector3(-1, 1f, (gridSize - 1) / 2f);

    m_ZoneA.transform.localPosition = new Vector3((gridSize - 1) / 4f, -0.4f, (gridSize - 1) / 1.32f);
    m_ZoneA.transform.localScale = new Vector3(gridSize / 20f, 1f, gridSize / 20f);
    m_ZoneB.transform.localPosition = new Vector3((gridSize - 1) / 1.32f, -0.4f, (gridSize - 1) / 1.32f);
    m_ZoneB.transform.localScale = new Vector3(gridSize / 20f, 1f, gridSize / 20f);
    m_ZoneC.transform.localPosition = new Vector3((gridSize - 1) / 4f, -0.4f, (gridSize - 1) / 4f);
    m_ZoneC.transform.localScale = new Vector3(gridSize / 20f, 1f, gridSize / 20f);
    m_ZoneD.transform.localPosition = new Vector3((gridSize - 1) / 1.32f, -0.4f, (gridSize - 1) / 4f);
    m_ZoneD.transform.localScale = new Vector3(gridSize / 20f, 1f, gridSize / 20f);
  }

AgentBehaviourScript.cs:

public override void Initialize()
{
    //m_ResetParams = Academy.Instance.EnvironmentParameters;

    //area.AreaReset();

    nav_nodes.AddRange(GameObject.FindGameObjectsWithTag("navigation_node"));

    //routine_activity_nodes_list = routine_activity_space(total_routine_activity_nodes, nav_nodes);

    target_risk = new Dictionary<GameObject, float>();// A dictionary for each target risk.
    target_effort_normalized = new Dictionary<GameObject, float>();// A dictionary for each target effort.
    target_reward = new Dictionary<GameObject, double>();// A dictionary that stores the rewards associated with each target.
    total_reward_measure = new Dictionary<GameObject, float>(); // A dictionary that contains the total reward measure calculated.

}



public GameObject[] routine_activity_space(int number_of_nodes, List <GameObject> current_nodes){

  GameObject [] nodes_for_offender = new GameObject[number_of_nodes];
  System.Random random = new System.Random();

  // Get the current navigation node the offender is on and add it to the list first.
  for(int i = 0; i < current_nodes.Count; i++){
    if(current_nodes[i].transform.position.x == this.transform.position.x && current_nodes[i].transform.position.z == this.transform.position.z){
      nodes_for_offender[0] = current_nodes[i];
      // We need to remove the node that was added to the nodes_for_offender array.
      // So that the agent does not randomly select the same node twice for its routine activity space.
      current_nodes.Remove(current_nodes[i]);
    }
  }

  for(int i = 1; i < number_of_nodes; i++){
    //RandomItems.Add(AllItems[random.Next(0, AllItems.Count + 1)]);
    nodes_for_offender[i] = current_nodes[random.Next(0, current_nodes.Count)];
  }
  for(int i = 0; i < nodes_for_offender.Length; i++){
    nodes_for_offender[i].GetComponent<Renderer>().material.color=Color.red;
  }

  return nodes_for_offender;
}


// The reward/penalty calculi for the agent.
public override void OnActionReceived(ActionBuffers actionBuffers)

{

  //GameObject [] routine_activity_nodes
    // The agent is penalised each frame it doesn't get to the goal. This will ensure
    // this enforces the behaviour to reach the goal, and can be converted into
    // a distance decay function. I.e. when agent leaves its home region buffer
    // the penalty starts to be inflicted.
    //AddReward(-0.01f);

    //Current pos

    // var number_of_nodes = routine_activity_nodes.Length;
    var targetPos = transform.position;
    // Movement logic.

    // get current position
    // if moving left right up or down decreases distance from current pos to target
    // move in that position.
    var right_move = targetPos + new Vector3(1f, 0, 0f);
    var left_move = targetPos + new Vector3(-1f, 0, 0f);
    var up_move = targetPos + new Vector3(0f, 0, 1f);
    var down_move = targetPos + new Vector3(0f, 0, -1f);

    // Select a random node from the routine activity space, this should not be 0 in the first instance
    // then

    //current_nav_node = UnityEngine.Random.Range(0, total_routine_activity_nodes);

    var selected_node = routine_activity_nodes_list[current_nav_node];

    if(Vector3.Distance(right_move, selected_node.transform.position) <=   // line 125 in the error above
    Vector3.Distance(left_move, selected_node.transform.position) &&
    Vector3.Distance(right_move, selected_node.transform.position) <=
    Vector3.Distance(up_move, selected_node.transform.position) &&
    Vector3.Distance(right_move, selected_node.transform.position) <=
    Vector3.Distance(down_move, selected_node.transform.position)){

      targetPos = transform.position + new Vector3(1f, 0, 0f);
      //Debug.Log("RIGHT");


    }else if(Vector3.Distance(left_move, selected_node.transform.position) <=
    Vector3.Distance(right_move, selected_node.transform.position) &&
    Vector3.Distance(left_move, selected_node.transform.position) <=
    Vector3.Distance(up_move, selected_node.transform.position) &&
    Vector3.Distance(left_move, selected_node.transform.position) <=
    Vector3.Distance(down_move, selected_node.transform.position)){

      targetPos = transform.position + new Vector3(-1f, 0, 0f);
      //Debug.Log("LEFT");

    }else if(Vector3.Distance(up_move, selected_node.transform.position) <=
    Vector3.Distance(right_move, selected_node.transform.position) &&
    Vector3.Distance(up_move, selected_node.transform.position) <=
    Vector3.Distance(left_move, selected_node.transform.position) &&
    Vector3.Distance(up_move, selected_node.transform.position) <=
    Vector3.Distance(down_move, selected_node.transform.position)){

      targetPos = transform.position + new Vector3(0f, 0, 1f);
      //Debug.Log("UP");

    }else if (Vector3.Distance(down_move, selected_node.transform.position) <=
    Vector3.Distance(right_move, selected_node.transform.position) &&
    Vector3.Distance(down_move, selected_node.transform.position) <=
    Vector3.Distance(left_move, selected_node.transform.position) &&
    Vector3.Distance(down_move, selected_node.transform.position) <=
    Vector3.Distance(up_move, selected_node.transform.position)){

      targetPos = transform.position + new Vector3(0f, 0, -1f);
      //Debug.Log("DOWN");
    }


    var hit = Physics.OverlapBox(
        targetPos, new Vector3(0.3f, 0.3f, 0.3f));

    transform.position = targetPos;

         // var action = actionBuffers.DiscreteActions[0];
         // Debug.Log(("ACTION", action));
        // If it collides with a gameobject with the tag GOAL it should be rewarded and the episode ended.
        if (hit.Where(col => col.gameObject.CompareTag("rewarding_building_zone_A")).ToArray().Length == 1)
        {
            // AGENT'S PERSONAL REWARD CALCULI, MIN AND MAX THRESHOLD FOR EXPECTED OFFENCE REWARD OUTCOME. Calculus = reward[0, 1] - (Risk[0, 1] + Effort[0, 1]).
            //commit_offence(hit, 0f, 0.99f, actionBuffers.DiscreteActions[0]);
            commit_offence(hit, actionBuffers.DiscreteActions[0]);
            //Debug.Log("ZONEA");
            //SetReward(1f);
            //EndEpisode();
        }
        else if (hit.Where(col => col.gameObject.CompareTag("rewarding_building_zone_B")).ToArray().Length == 1)
        {
            //commit_offence(hit, 0f, 0.99f, actionBuffers.DiscreteActions[0]);
            commit_offence(hit, actionBuffers.DiscreteActions[0]);
            //SetReward(0.5f);
            //EndEpisode();
        }
        else if (hit.Where(col => col.gameObject.CompareTag("rewarding_building_zone_C")).ToArray().Length == 1)
        {
            //commit_offence(hit, 0f, 0.99f, actionBuffers.DiscreteActions[0]);
            commit_offence(hit, actionBuffers.DiscreteActions[0]);
            //SetReward(0.1f);
            //EndEpisode();
        }
        else if (hit.Where(col => col.gameObject.CompareTag("rewarding_building_zone_D")).ToArray().Length == 1)
        {
            //commit_offence(hit, 0f, 0.99f, actionBuffers.DiscreteActions[0]);
            commit_offence(hit, actionBuffers.DiscreteActions[0]);
            //SetReward(0.1f);
            //EndEpisode();
        }
        else if (hit.Where(col => col.gameObject.CompareTag("neautral_building")).ToArray().Length == 1)
        {
              //Debug.Log("NEUTRAL_BUILDING");
            //SetReward(0f);
            //EndEpisode();
        }

    // Once you arrive at a destination routine activity node, pick a random node to travel to.
    if(transform.position.x == selected_node.transform.position.x &&
    transform.position.z == selected_node.transform.position.z){
      var placeholder = current_nav_node;
      current_nav_node = GetRandom(0, total_routine_activity_nodes, placeholder);
    }


}

// This method checks to see if the rewards from the current target is within the threshold
// whereby the offender will commit an offence, and either be rewarded, penalised or neutral.
// if the rewards are within the threshold i.e. between max and min threshold, the offender is rewarded,
// if the rewards are below 0, i.e. negative, then the offender is penalised; otherwise the offender receives 0.
public void commit_offence(Collider[] collided_objects, int action){

  // action = 0 commit offence.
  // action = 1 don't commit offence.

  foreach(Collider col in collided_objects){

    if(total_reward_measure.ContainsKey(col.gameObject)){

        // If the total_reward from target is within the bounds of the requirements of the offender >= min and <= max
        // and the offender decides to commit an offence here, they are rewarded. This is a good outcome for the offender.
        if(action == 0){
            Debug.Log((total_reward_measure[col.gameObject], "COMMIT OFFENCE", this.GetInstanceID()));
            SetReward(total_reward_measure[col.gameObject]);
            offence_committed++;
            // Keep track of cumulative reward.
            total_reward = total_reward + total_reward_measure[col.gameObject];
        }// }else if(action == 0 && (total_reward_measure[col.gameObject] < 0f)){
        //     Debug.Log((total_reward_measure[col.gameObject], min_threshold, max_threshold, "COMMIT OFFENCE BAD"));
        //     SetReward(-1f);
        else if(action == 1){
            // If the agent decides not to commit an offence at this target,
            // it receives a neutral reward of 0.
            Debug.Log("DON'T COMMIT OFFENCE");
            SetReward(0f);
        }
    }
  }
}


public int GetRandom (int min, int max, int last_number)
{
    int rand = UnityEngine.Random.Range (min, max);
    while (rand == last_number)
        rand = UnityEngine.Random.Range (min, max);
    last_number = rand;
    return rand;
}


public override void Heuristic(in ActionBuffers actionsOut)
{
      var discreteActionsOut = actionsOut.DiscreteActions;

      if (Input.GetKey(KeyCode.A))
      {
          discreteActionsOut[0] = k_commit_offence;
      }
      if (Input.GetKey(KeyCode.D))
      {
          discreteActionsOut[0] = k_dont_commit_offence;
      }
      //OnActionReceived(routine_activity_nodes_list);
}

// to be implemented by the developer
public override void OnEpisodeBegin()
{
    // Reset agent parameters.

    area.AreaReset();    // <-- the reset call that triggers the exception

    nav_nodes.Clear();

    nav_nodes.AddRange(GameObject.FindGameObjectsWithTag("navigation_node"));

    //spawn_agent();

    // A list of the routine_activity_nodes chosen for the agent.
    routine_activity_nodes_list = routine_activity_space(total_routine_activity_nodes, nav_nodes);

    target_risk = new Dictionary<GameObject, float>();// A dictionary for each target risk.
    target_effort_normalized = new Dictionary<GameObject, float>();// A dictionary for each target effort.
    target_reward = new Dictionary<GameObject, double>();// A dictionary that stores the rewards associated with each target.
    total_reward_measure = new Dictionary<GameObject, float>(); // A dictionary that contains the total reward measure calculated.

    target_risk.Clear();
    target_effort_normalized.Clear();
    target_reward.Clear();
    total_reward_measure.Clear();

    // Adds the risk given the number of interventions around each target to a dictionary.
    calculate_risk_factor(area);

    // IF you want to see the calculated risk for each target uncomment.
    //get_dictionary_values(target_risk);

    // Gets the distance for each target to the agent and adds it to a dictionary.
    calculate_effort(area);

    // Calculates the normalised effort for each target.
    target_effort_normalized = get_normalized_effort(target_effort_normalized);

    // IF you want to see the normalized effort for each target, uncomment.
    //get_dictionary_values(target_effort_normalized);

    get_reward_for_target(area);

    calculate_reward_measure(area);


    //get_dictionary_values(total_reward_measure);
}

Thank you ever so much!

I'll kick this over to the team for some insight!

Hi @olmez49 !

The error you're seeing happens because the environment reset is called multiple times, so each agent tries to destroy objects that have already been destroyed by another agent's reset. We're aware that this API is not ideal for multi-agent training, and we've added more support (MultiAgentGroup, the POCA trainer) in a recent release. MultiAgentGroup gives you access to all the agents in the environment so that you can synchronise episodes across agents, split rewards, or perform other group-wise operations.
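As a rough illustration, a controller built around SimpleMultiAgentGroup could look something like the sketch below (assuming ML-Agents 2.x; the controller class name and the OnEpisodeOver hook are made up, while GridArea, AgentBehaviourScript, AreaReset() and the "agent" tag come from your snippets):

using Unity.MLAgents;
using UnityEngine;

public class CrimeEnvironmentController : MonoBehaviour
{
    public GridArea area;                 // the GridArea shown above
    SimpleMultiAgentGroup m_AgentGroup;

    void Start()
    {
        m_AgentGroup = new SimpleMultiAgentGroup();
        foreach (var agentObj in GameObject.FindGameObjectsWithTag("agent"))
        {
            // Register every agent so their episodes are handled as one group.
            m_AgentGroup.RegisterAgent(agentObj.GetComponent<AgentBehaviourScript>());
        }
        area.AreaReset();                 // reset exactly once, from the controller
    }

    // Call this from wherever you decide the group episode is over
    // (e.g. a step limit or a terminal event), instead of each agent resetting the area.
    public void OnEpisodeOver()
    {
        m_AgentGroup.EndGroupEpisode();
        area.AreaReset();
    }
}

With this setup the agents would no longer call area.AreaReset() (or EndEpisode()) from OnEpisodeBegin(); the controller owns the single reset.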

Another alternative would be to create an environment controller class that has access to the relevant information in the environment and calls the reset once when the episode is over.
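A very rough sketch of that alternative (EpisodeController and maxEnvironmentSteps are made-up names; the key point is that only this controller ever calls AreaReset(), and the agents' OnEpisodeBegin() no longer does):

using UnityEngine;

public class EpisodeController : MonoBehaviour
{
    public GridArea area;                   // the GridArea from the snippets above
    public int maxEnvironmentSteps = 5000;  // hypothetical episode length

    int m_StepCount;
    AgentBehaviourScript[] m_Agents;

    void Start()
    {
        m_Agents = FindObjectsOfType<AgentBehaviourScript>();
        area.AreaReset();                   // single reset at start-up
    }

    void FixedUpdate()
    {
        m_StepCount++;
        if (m_StepCount >= maxEnvironmentSteps)
        {
            // End every agent's episode first, then reset the shared area once.
            foreach (var agent in m_Agents)
            {
                agent.EndEpisode();
            }
            area.AreaReset();
            m_StepCount = 0;
        }
    }
}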

It seems that training your agent with your own custom arenas/environment leads to the arena resetting multiple times, with an arena in the YAML file being skipped. How can this be avoided?