Hi, everyone.
I'm trying to train a model to play Minesweeper.
I've already tried different configurations, reward strategies, and observation approaches, but so far with no meaningful results at all.
The best results from a 15-million-step run are:
Mean reward rises from -8 to -0.5.
10-20% of all clicks land on already-revealed cells (frustrating).
A win rate of about 6%.
Could anybody give me advice on what I'm doing wrong or what I should change?
My most "successful" attempt so far is:
Board size is 20x20.
Reward strategy:
I use a dynamic strategy: the longer the agent survives, the more reward it receives.
_step is the number of cells the model has revealed during the current episode; it increments by one with each click on an unrevealed cell and resets at the start of a new episode. (The full wiring is sketched in code after the list.)
Win: SetReward(1f)
Lose: SetReward(-1f)
Unrevealed cell is clicked: AddReward(0.1f + 0.005f * _step)
Revealed cell is clicked: AddReward(-0.3f + 0.005f * _step)
Mined cell is clicked: AddReward(-0.5f)
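To make this concrete, here is a minimal sketch of how these rewards are wired into OnActionReceived, assuming one discrete action per cell (a single branch of size width * height); Reveal() and IsWon are simplified stand-ins for my actual game code:
using Unity.MLAgents;
using Unity.MLAgents.Actuators;
public class MinesweeperAgent : Agent
{
    public Game _game;  // the board, assigned in the Inspector
    private int _step;  // cells revealed this episode

    public override void OnActionReceived(ActionBuffers actions)
    {
        // Decode the flat action index into board coordinates.
        int action = actions.DiscreteActions[0];
        int x = action % _game.width;
        int y = action / _game.width;
        Cell cell = _game.Grid[x, y];

        if (cell.revealed)
        {
            // Already-revealed cell: penalty, softened the longer the episode runs.
            AddReward(-0.3f + 0.005f * _step);
            return;
        }

        if (cell.type == Cell.Type.Mine)
        {
            AddReward(-0.5f);
            SetReward(-1f); // lose; note SetReward overwrites rewards added this step
            EndEpisode();
            return;
        }

        _step++;
        AddReward(0.1f + 0.005f * _step);
        _game.Reveal(x, y); // stand-in for my reveal/flood-fill logic

        if (_game.IsWon)    // stand-in for my win check
        {
            SetReward(1f);  // win
            EndEpisode();
        }
    }
}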
Observations:
A custom board sensor based on the Match3 example:
using System;
using System.Collections.Generic;
using Unity.MLAgents.Sensors;
using UnityEngine;
public class BoardSensor : ISensor, IDisposable
{
public BoardSensor(Game game, int channels)
{
_game = game;
_channels = channels;
_observationSpec = ObservationSpec.Visual(channels, game.height, game.width);
_texture = new Texture2D(game.width, game.height, TextureFormat.RGB24, false);
_textureUtils = new OneHotToTextureUtil(game.height, game.width);
}
private readonly Game _game;
private readonly int _channels;
private readonly ObservationSpec _observationSpec;
private Texture2D _texture;
private readonly OneHotToTextureUtil _textureUtils;
public ObservationSpec GetObservationSpec()
{
return _observationSpec;
}
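// Writes the board as a one-hot tensor, one channel per cell state (channel-first: [channel, y, x]).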
public int Write(ObservationWriter writer)
{
int offset = 0;
int width = _game.width;
int height = _game.height;
for (int y = 0; y < height; y++)
{
for (int x = 0; x < width; x++)
{
for (var i = 0; i < _channels; i++)
{
writer[i, y, x] = GetChannelValue(_game.Grid[x, y], i);
offset++;
}
}
}
return offset;
}
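// Channel layout: 0 = unrevealed, 1-8 = revealed number, 9 = revealed empty, 10 = revealed mine.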
private float GetChannelValue(Cell cell, int channel)
{
if (!cell.revealed)
return channel == 0 ? 1.0f : 0.0f;
if (cell.type == Cell.Type.Number)
return channel == cell.number ? 1.0f : 0.0f;
if (cell.type == Cell.Type.Empty)
return channel == 9 ? 1.0f : 0.0f;
if (cell.type == Cell.Type.Mine)
return channel == 10 ? 1.0f : 0.0f;
return 0.0f;
}
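// Compressed path: packs the one-hot channels three at a time into RGB PNGs.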
public byte[] GetCompressedObservation()
{
var allBytes = new List<byte>();
var numImages = (_channels + 2) / 3;
for (int i = 0; i < numImages; i++)
{
_textureUtils.EncodeToTexture(_game.Grid, _texture, 3 * i, _game.height, _game.width);
allBytes.AddRange(_texture.EncodeToPNG());
}
return allBytes.ToArray();
}
public void Update() { }
public void Reset() { }
public CompressionSpec GetCompressionSpec()
{
return new CompressionSpec(SensorCompressionType.PNG);
}
public string GetName()
{
return "BoardVisualSensor";
}
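// Maps a cell's one-hot index onto R/G/B so three observation channels fit into one texture.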
internal class OneHotToTextureUtil
{
Color32[] m_Colors;
int m_MaxHeight;
int m_MaxWidth;
private static Color32[] s_OneHotColors = { Color.red, Color.green, Color.blue };
public OneHotToTextureUtil(int maxHeight, int maxWidth)
{
m_Colors = new Color32[maxHeight * maxWidth];
m_MaxHeight = maxHeight;
m_MaxWidth = maxWidth;
}
public void EncodeToTexture(
CellGrid cells,
Texture2D texture,
int channelOffset,
int currentHeight,
int currentWidth
)
{
var i = 0;
for (var y = m_MaxHeight - 1; y >= 0; y--)
{
for (var x = 0; x < m_MaxWidth; x++)
{
Color32 colorVal = Color.black;
if (x < currentWidth && y < currentHeight)
{
int oneHotValue = GetHotValue(cells[x, y]);
if (oneHotValue >= channelOffset && oneHotValue < channelOffset + 3)
{
colorVal = s_OneHotColors[oneHotValue - channelOffset];
}
}
m_Colors[i++] = colorVal;
}
}
texture.SetPixels32(m_Colors);
}
private int GetHotValue(Cell cell)
{
if (!cell.revealed)
return 0;
if (cell.type == Cell.Type.Number)
return cell.number;
if (cell.type == Cell.Type.Empty)
return 9;
if (cell.type == Cell.Type.Mine)
return 10;
return 0;
}
}
public void Dispose()
{
if (!ReferenceEquals(null, _texture))
{
if (Application.isEditor)
{
// Edit Mode tests complain if we use Destroy()
UnityEngine.Object.DestroyImmediate(_texture);
}
else
{
UnityEngine.Object.Destroy(_texture);
}
_texture = null;
}
}
}
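The sensor is attached to the agent through a SensorComponent, following the same Match3 pattern. A rough sketch (the channel count of 11 matches the layout in GetChannelValue above):
using Unity.MLAgents.Sensors;
using UnityEngine;

public class BoardSensorComponent : SensorComponent
{
    public Game game; // assigned in the Inspector

    public override ISensor[] CreateSensors()
    {
        // 11 channels: unrevealed, numbers 1-8, empty, mine.
        return new ISensor[] { new BoardSensor(game, 11) };
    }
}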
YAML config file:
behaviors:
Minesweeper:
trainer_type: ppo
hyperparameters:
batch_size: 512
buffer_size: 12800
learning_rate: 0.0005
beta: 0.0175
epsilon: 0.25
lambd: 0.95
num_epoch: 3
learning_rate_schedule: linear
network_settings:
normalize: true
hidden_units: 256
num_layers: 4
vis_encode_type: match3
reward_signals:
extrinsic:
gamma: 0.95
strength: 1.0
keep_checkpoints: 5
max_steps: 15000000
time_horizon: 128
summary_freq: 10000
environment_parameters:
mines_amount:
curriculum:
- name: Lesson0
completion_criteria:
measure: reward
behavior: Minesweeper
signal_smoothing: true
min_lesson_length: 100
threshold: 0.1
value:
sampler_type: uniform
sampler_parameters:
min_value: 8.0
max_value: 13.0
- name: Lesson1
completion_criteria:
measure: reward
behavior: Minesweeper
signal_smoothing: true
min_lesson_length: 100
threshold: 0.5
value:
sampler_type: uniform
sampler_parameters:
min_value: 14.0
max_value: 19.0
- name: Lesson2
completion_criteria:
measure: reward
behavior: Minesweeper
signal_smoothing: true
min_lesson_length: 100
threshold: 0.7
value:
sampler_type: uniform
sampler_parameters:
min_value: 20.0
max_value: 25.0
- name: Lesson3
completion_criteria:
measure: reward
behavior: Minesweeper
signal_smoothing: true
min_lesson_length: 100
threshold: 0.85
value:
sampler_type: uniform
sampler_parameters:
min_value: 26.0
max_value: 31.0
- name: Lesson4
value: 32.0
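For reference, the curriculum's mines_amount parameter is read back on the C# side at each episode reset. A minimal sketch, in the same agent class as the reward sketch above (NewGame is a hypothetical reset method on my Game class; the 32f fallback matches the Lesson4 value):
public override void OnEpisodeBegin()
{
    // Current lesson's sampled value, or the fallback when no curriculum is active.
    float mines = Academy.Instance.EnvironmentParameters.GetWithDefault("mines_amount", 32f);
    _game.NewGame(Mathf.RoundToInt(mines));
    _step = 0;
}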