Need help with a Minesweeper RL training issue involving a 2D grid

Hi, everyone.

I’m trying to teach a model to play Minesweeper.
I’ve already tried different configurations, reward strategies, and observation approaches, but there are no meaningful results so far.

The best results from a 15-million-step run are:
The mean reward increases from about -8 to -0.5.
10-20% of all clicks land on already-revealed cells (frustrating).
A win rate of about 6%.

Could anybody advise me on what I’m doing wrong or what I should change?

The most “successful” setup so far is:

The board size is 20x20.

Reward strategy:
I use a dynamic strategy: the longer the agent survives, the more reward it receives.
_step is the number of cells the model has revealed during the current episode; it increments with each click on an unrevealed cell and resets at the start of a new episode. (A rough sketch of how these rewards are applied in the agent follows the list below.)
Win: SetReward(1f)
Lose: SetReward(-1f)
Unrevealed cell is clicked: AddReward(0.1f + 0.005f * _step)
Revealed cell is clicked: AddReward(-0.3f + 0.005f * _step)
Mined cell is clicked: AddReward(-0.5f)
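
A rough sketch of how these rewards are applied (class and field names such as MinesweeperAgent are illustrative; only AddReward/SetReward/EndEpisode are actual ML-Agents calls, and SetReward replaces whatever reward was added in the same step):

using Unity.MLAgents;
using Unity.MLAgents.Actuators;

public class MinesweeperAgent : Agent
{
    public Game game;
    private int _step;

    public override void OnEpisodeBegin()
    {
        _step = 0;                               // reset the per-episode reveal counter
    }

    public override void OnActionReceived(ActionBuffers actions)
    {
        // One discrete action per board cell, decoded into (x, y).
        int index = actions.DiscreteActions[0];
        int x = index % game.width;
        int y = index / game.width;
        var cell = game.Grid[x, y];

        if (cell.revealed)
        {
            AddReward(-0.3f + 0.005f * _step);   // clicked an already-revealed cell
            return;
        }

        if (cell.type == Cell.Type.Mine)
        {
            AddReward(-0.5f);                    // mine penalty...
            SetReward(-1f);                      // ...overridden by the lose reward in the same step
            EndEpisode();
            return;
        }

        _step++;
        AddReward(0.1f + 0.005f * _step);        // revealed an unrevealed safe cell
        // The actual reveal/flood-fill and the win check (SetReward(1f); EndEpisode();)
        // happen in the game code and are omitted here.
    }
}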

Observations:
Custom board sensor based on the Match3 example.

using System;
using System.Collections.Generic;
using Unity.MLAgents.Sensors;
using UnityEngine;

public class BoardSensor : ISensor, IDisposable
{
    public BoardSensor(Game game, int channels)
    {
        _game = game;
        _channels = channels;

        _observationSpec = ObservationSpec.Visual(channels, game.height, game.width);
        _texture = new Texture2D(game.width, game.height, TextureFormat.RGB24, false);
        _textureUtils = new OneHotToTextureUtil(game.height, game.width);
    }

    private readonly Game _game;
    private readonly int _channels;
    private readonly ObservationSpec _observationSpec;
    private Texture2D _texture;
    private readonly OneHotToTextureUtil _textureUtils;

    public ObservationSpec GetObservationSpec()
    {
        return _observationSpec;
    }

    public int Write(ObservationWriter writer)
    {
        int offset = 0;
        int width = _game.width;
        int height = _game.height;

        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                for (var i = 0; i < _channels; i++)
                {
                    writer[i, y, x] = GetChannelValue(_game.Grid[x, y], i);
                    offset++;
                }
            }
        }
        
        return offset;
    }

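    // One-hot channel layout: channel 0 = unrevealed cell,
    // channels 1-8 = revealed Number cell (channel index == cell.number),
    // channel 9 = revealed Empty cell, channel 10 = revealed Mine.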
    private float GetChannelValue(Cell cell, int channel)
    {
        if (!cell.revealed)
            return channel == 0 ? 1.0f : 0.0f;

        if (cell.type == Cell.Type.Number)
            return channel == cell.number ? 1.0f : 0.0f;

        if (cell.type == Cell.Type.Empty)
            return channel == 9 ? 1.0f : 0.0f;
        
        if (cell.type == Cell.Type.Mine)
            return channel == 10 ? 1.0f : 0.0f;
        
        return 0.0f;
    }

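    // Packs the one-hot channels three at a time into the R/G/B planes of
    // successive PNG images, mirroring the Match3 example's compressed path.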
    public byte[] GetCompressedObservation()
    {
        var allBytes = new List<byte>();
        var numImages = (_channels + 2) / 3;
        for (int i = 0; i < numImages; i++)
        {
            _textureUtils.EncodeToTexture(_game.Grid, _texture, 3 * i, _game.height, _game.width);
            allBytes.AddRange(_texture.EncodeToPNG());
        }

        return allBytes.ToArray();
    }

    public void Update() { }

    public void Reset() { }

    public CompressionSpec GetCompressionSpec()
    {
        return new CompressionSpec(SensorCompressionType.PNG);
    }

    public string GetName()
    {
        return "BoardVisualSensor";
    }
    
    internal class OneHotToTextureUtil
    {
        Color32[] m_Colors;
        int m_MaxHeight;
        int m_MaxWidth;
        private static Color32[] s_OneHotColors = { Color.red, Color.green, Color.blue };

        public OneHotToTextureUtil(int maxHeight, int maxWidth)
        {
            m_Colors = new Color32[maxHeight * maxWidth];
            m_MaxHeight = maxHeight;
            m_MaxWidth = maxWidth;
        }

        public void EncodeToTexture(
            CellGrid cells,
            Texture2D texture,
            int channelOffset,
            int currentHeight,
            int currentWidth
        )
        {
            var i = 0;
            for (var y = m_MaxHeight - 1; y >= 0; y--)
            {
                for (var x = 0; x < m_MaxWidth; x++)
                {
                    Color32 colorVal = Color.black;
                    if (x < currentWidth && y < currentHeight)
                    {
                        int oneHotValue = GetHotValue(cells[x, y]);
                        if (oneHotValue >= channelOffset && oneHotValue < channelOffset + 3)
                        {
                            colorVal = s_OneHotColors[oneHotValue - channelOffset];
                        }
                    }
                    m_Colors[i++] = colorVal;
                }
            }
            texture.SetPixels32(m_Colors);
        }

        private int GetHotValue(Cell cell)
        {
            if (!cell.revealed)
                return 0;
            
            if (cell.type == Cell.Type.Number)
                return cell.number;

            if (cell.type == Cell.Type.Empty)
                return 9;

            if (cell.type == Cell.Type.Mine)
                return 10;
            
            return 0;
        }
    }

    public void Dispose()
    {
        if (!ReferenceEquals(null, _texture))
        {
            if (Application.isEditor)
            {
                // Edit Mode tests complain if we use Destroy()
                UnityEngine.Object.DestroyImmediate(_texture);
            }
            else
            {
                UnityEngine.Object.Destroy(_texture);
            }
            _texture = null;
        }
    }
}
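
The sensor is registered on the agent’s GameObject through a SensorComponent, roughly like this (a simplified sketch; the component name and serialized fields are illustrative, and it assumes an ML-Agents version where CreateSensors() returns an ISensor[]):

using Unity.MLAgents.Sensors;

public class BoardSensorComponent : SensorComponent
{
    public Game game;
    public int channels = 11;    // channels 0-10, matching GetChannelValue above

    public override ISensor[] CreateSensors()
    {
        return new ISensor[] { new BoardSensor(game, channels) };
    }
}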

YAML config file:

behaviors:
  Minesweeper:
    trainer_type: ppo
    hyperparameters:
      batch_size: 512
      buffer_size: 12800
      learning_rate: 0.0005
      beta: 0.0175
      epsilon: 0.25
      lambd: 0.95
      num_epoch: 3
      learning_rate_schedule: linear

    network_settings:
      normalize: true
      hidden_units: 256
      num_layers: 4
      vis_encode_type: match3

    reward_signals:
      extrinsic:
        gamma: 0.95
        strength: 1.0

    keep_checkpoints: 5
    max_steps: 15000000
    time_horizon: 128
    summary_freq: 10000

environment_parameters:
  mines_amount:
    curriculum:
      - name: Lesson0
        completion_criteria:
          measure: reward
          behavior: Minesweeper
          signal_smoothing: true
          min_lesson_length: 100
          threshold: 0.1
        value:
          sampler_type: uniform
          sampler_parameters:
            min_value: 8.0
            max_value: 13.0
      - name: Lesson1
        completion_criteria:
          measure: reward
          behavior: Minesweeper
          signal_smoothing: true
          min_lesson_length: 100
          threshold: 0.5
        value:
          sampler_type: uniform
          sampler_parameters:
            min_value: 14.0
            max_value: 19.0
      - name: Lesson2
        completion_criteria:
          measure: reward
          behavior: Minesweeper
          signal_smoothing: true
          min_lesson_length: 100
          threshold: 0.7
        value:
          sampler_type: uniform
          sampler_parameters:
            min_value: 20.0
            max_value: 25.0
      - name: Lesson3
        completion_criteria:
          measure: reward
          behavior: Minesweeper
          signal_smoothing: true
          min_lesson_length: 100
          threshold: 0.85
        value:
          sampler_type: uniform
          sampler_parameters:
            min_value: 26.0
            max_value: 31.0
      - name: Lesson4
        value: 32.0
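
On the Unity side, the mines_amount curriculum value is read from the environment parameters when the board resets, roughly like this (a sketch; Game.ResetBoard and the default of 10 are placeholders):

using Unity.MLAgents;
using UnityEngine;

public class MinesweeperBoardReset : MonoBehaviour
{
    public Game game;

    public void ResetEpisode()
    {
        // GetWithDefault returns the current lesson's sampled value,
        // or the default when no trainer/curriculum is connected.
        float mines = Academy.Instance.EnvironmentParameters.GetWithDefault("mines_amount", 10f);
        game.ResetBoard(Mathf.RoundToInt(mines));
    }
}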