I am using the ML-Agents Python API to train a car agent with DDPG to drive around a track.
Scenario: after stepping the environment with a new action for each agent and calling env.get_steps(), one of the six agents topples over and calls EndEpisode(). env.get_steps() then returns an empty DecisionSteps and a TerminalSteps containing that one agent.
Why is DecisionSteps empty when I still have five agents that need an action? I need the observations in DecisionSteps in order to train the DDPG agent.
Hope this is clear.
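To illustrate, this is roughly what I see right after the agent topples (using the same names as the code below):

decision_steps, terminal_steps = env.get_steps(brain_name)
print(len(decision_steps))   # 0 -- empty, even though 5 agents are still running
print(len(terminal_steps))   # 1 -- the agent that toppled and called EndEpisode()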
import numpy as np

from mlagents_envs.base_env import ActionTuple

# Get the default behavior name (the "brain") and its spec
brain_name = list(env.behavior_specs)[0]
brain = env.behavior_specs[brain_name]
# Size of the continuous action vector expected from each agent
action_size = brain.action_spec.continuous_size
state_size = 119
num_agents = 6
agent = Agent(state_size=state_size, action_size=action_size, num_agents=num_agents, random_seed=0)
# Loop over episodes
for i_episode in range(1, num_episodes + 1):
    # Track which agents have terminated this episode
    dones = np.zeros(num_agents, dtype=bool)
    # Reset the Unity environment at the beginning of each episode
    env.reset()
    decision_steps, terminal_steps = env.get_steps(brain_name)
    # Get the initial vector observations, one row per agent
    states = decision_steps.obs[0]
    # Reset the training agent for the new episode
    agent.reset()
    # Set the initial episode score to zero for each agent
    agent_scores = np.zeros(num_agents)
    # Run the episode training loop:
    # at each step, choose an action from the current state observations,
    # then use the resulting next state and reward to update the agent's
    # actor and critic networks. Repeat until every agent reports done.
    while True:
        # Determine actions for the Unity agents from the current states
        actions = agent.act(states)
        print(actions)
        actionAT = ActionTuple()
        actionAT.add_continuous(continuous=actions)
        env.set_actions(brain_name, actionAT)
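        # set_actions expects one action per agent listed in decision_steps,
        # as a float32 array of shape (len(decision_steps), action_size)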
        # Move the simulation forward
        env.step()
        # Get the new simulation results
        decision_steps, terminal_steps = env.get_steps(brain_name)
        # Next vector observations, one row per agent still requesting a decision
        next_states = decision_steps.obs[0]
        # Rewards for each agent present in decision_steps
        rewards = decision_steps.reward
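        # NOTE: an agent that just called EndEpisode() is not in decision_steps,
        # so next_states and rewards can have fewer than num_agents rows here;
        # that agent's final observation and reward are in terminal_steps instead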
        # Mark any agents whose episode has finished
        if len(terminal_steps) != 0:
            for i in range(num_agents):
                if i in terminal_steps.agent_id:
                    dones[i] = True
        # Send (S, A, R, S') to the training agent for the replay buffer and network updates
        agent.step(states, actions, rewards, next_states, dones)
        # Roll the new states over to the current states for the next action selection
        states = next_states
        # Update the episode score for each Unity agent
        agent_scores += rewards
        # Once every agent has reported that its episode is done,
        # exit the loop and begin a new episode
        if np.all(dones):
            break
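For reference, my understanding (a sketch only, based on the mlagents_envs API) is that both step results can be indexed per agent id, which is how I had expected to keep the five remaining agents in sync:

# Sketch: reading per-agent data out of the step results by agent id
for agent_id in terminal_steps.agent_id:
    terminal_step = terminal_steps[agent_id]   # per-agent TerminalStep
    final_obs = terminal_step.obs[0]           # that agent's final observation
    final_reward = terminal_step.reward        # that agent's final reward
for agent_id in decision_steps.agent_id:
    decision_step = decision_steps[agent_id]   # per-agent DecisionStep
    obs = decision_step.obs[0]                 # observation awaiting an action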
Thank you for any help.