First commit
This commit is contained in:
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
__pycache__/
|
||||
.ipynb_checkpoints/
|
||||
*.pth
|
||||
|
||||
60
README.md
Normal file
60
README.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# Ultimate Tic Tac Toe Deep Learning Bot
|
||||
|
||||
**Usage**
|
||||
Run `python run.py --help` for help.
|
||||
|
||||
**Shared flags**
|
||||
- --device: Torch device string. If omitted, it auto-picks "cuda" when available, otherwise "cpu".
|
||||
- --checkpoint: Path to the model checkpoint file. Default is latest.pth. It is loaded for eval, play, and checkpoint-based arena,
|
||||
and used as the save path for accepted training checkpoints.
|
||||
|
||||
**Training parameters**
|
||||
- --resume: Loads model and optimizer state from --checkpoint before continuing training.
|
||||
- --num-simulations default 100: MCTS rollouts per move during self-play. Higher is stronger/slower.
|
||||
- --num-iters default 50: Number of outer training iterations. Each iteration generates new self-play games, trains, then arena-tests
|
||||
the new model.
|
||||
- --num-eps default 20: Self-play games per iteration.
|
||||
- --epochs default 5: Passes over the current replay-buffer training set per iteration.
|
||||
- --batch-size default 64: Mini-batch size for gradient updates.
|
||||
- --lr default 5e-4: Adam learning rate.
|
||||
- --weight-decay default 1e-4: Adam weight decay (L2-style regularization).
|
||||
- --replay-buffer-size default 50000: Maximum number of training examples retained across iterations. Older examples are dropped.
|
||||
- --value-loss-weight default 1.0: Multiplier on the value-head loss in total training loss. Total loss is policy_KL +
|
||||
value_loss_weight * value_loss.
|
||||
- --grad-clip-norm default 5.0: Global gradient norm clipping threshold before optimizer step.
|
||||
- --temperature-threshold default 10: In self-play, moves before this step use stochastic sampling from MCTS visit counts; later
|
||||
moves use greedy selection.
|
||||
- --root-dirichlet-alpha default 0.3: Dirichlet noise alpha added to root priors during self-play MCTS to force exploration.
|
||||
- --root-exploration-fraction default 0.25: How much of that root prior is replaced by Dirichlet noise.
|
||||
- --arena-compare-games default 6: Number of head-to-head games between candidate and previous model after each iteration. If <= 0,
|
||||
every candidate is accepted.
|
||||
- --arena-accept-threshold default 0.55: Minimum average points needed in arena to keep the new model. Win = 1, draw = 0.5.
|
||||
- --arena-compare-simulations default 8: MCTS simulations per move during those arena comparison games. Separate from self-play
|
||||
--num-simulations.
|
||||
|
||||
**Evaluation parameters**
|
||||
|
||||
- --moves default "": Comma-separated move list to reach a position from the starting board, e.g. 0,10,4.
|
||||
- --top-k default 10: How many highest-probability legal moves to print from the model policy.
|
||||
- --with-mcts: Also run MCTS on that position and print the best move, instead of only raw network policy/value.
|
||||
- --num-simulations default 100: Only matters with --with-mcts; controls MCTS search depth for that evaluation.
|
||||
|
||||
**Play parameters**
|
||||
|
||||
- --human-player default 1: Which side you control. 1 means X, -1 means O.
|
||||
- --num-simulations default 100: MCTS simulations the AI uses for each move.
|
||||
|
||||
**Arena parameters**
|
||||
- --games default 20: Number of matches to run.
|
||||
- --num-simulations default 100: MCTS simulations per move for checkpoint-based players.
|
||||
- --x-player / --o-player: Either checkpoint or random. Chooses the agent type for each side.
|
||||
- --x-checkpoint / --o-checkpoint: Checkpoint path for that side when its player type is checkpoint. Ignored for random.
|
||||
|
||||
A few practical examples:
|
||||
|
||||
```bash
|
||||
python run.py train --num-iters 100 --num-eps 50 --resume
|
||||
python run.py eval --checkpoint latest.pth --moves 0,10,4 --with-mcts --num-simulations 200
|
||||
python run.py play --human-player -1 --num-simulations 300
|
||||
python run.py arena --games 50 --x-player checkpoint --o-player random
|
||||
```
|
||||
196
game.py
Normal file
196
game.py
Normal file
@@ -0,0 +1,196 @@
|
||||
import numpy as np
|
||||
|
||||
# Index triples (within a flattened 3x3 grid) that form a winning line:
# three rows, three columns, and the two diagonals.  Used both for small
# boards and for the meta-board of won small boards.
WIN_PATTERNS = [
    (0, 1, 2),
    (3, 4, 5),
    (6, 7, 8),
    (0, 3, 6),
    (1, 4, 7),
    (2, 5, 8),
    (0, 4, 8),
    (2, 4, 6),
]


class UltimateTicTacToe:
    """Ultimate Tic-Tac-Toe on a 9x9 grid of nine 3x3 "small" boards.

    A game state is a tuple ``(board_data, active_board)``:

    - ``board_data``: flat length-81 array; cell values are 1 (X),
      -1 (O) or 0 (empty).  Cell ``i`` sits at row ``i // 9``,
      column ``i % 9`` of the full grid.
    - ``active_board``: index 0-8 of the small board the next move must
      be played in, or ``None`` when any open small board is allowed.
    """

    def __init__(self):
        self.cells = 81          # number of cells == size of the action space
        self.board_width = 9     # side length of the full grid
        self.state_planes = 9    # feature planes produced by encode_state

    def get_init_board(self):
        """Return the empty starting state (no active-board constraint)."""
        b = np.zeros((self.cells,), dtype=int)
        return (b, None)

    def get_board_size(self):
        """Shape of the encoded state: (planes, height, width)."""
        return (self.state_planes, self.board_width, self.board_width)

    def get_action_size(self):
        """One action per cell of the full grid."""
        return self.cells

    def get_next_state(self, board, player, action, verify_move=False):
        """Apply *action* for *player*; return ((board_data, active_board), -player).

        When *verify_move* is set, returns False (instead of raising) if the
        move is illegal.  The cell's position inside its small board decides
        which small board the opponent must play in next; if that board is
        already closed (won or full), the opponent may play anywhere.
        """
        if verify_move and self.get_valid_moves(board)[action] == 0:
            return False

        new_board_data = np.copy(board[0])
        new_board_data[action] = player

        # Position of the cell within its small board -> forced next board.
        next_board = ((action // 9) % 3) * 3 + (action % 3)
        if self.is_board_full(new_board_data, next_board):
            next_board = None

        # Hand the turn to the opponent (perspective flips via -player).
        return ((new_board_data, next_board), -player)

    def is_board_full(self, board_data, next_board):
        """True when small board *next_board* is closed: won by either side
        or completely filled, so no further moves are allowed in it."""
        return (
            self._is_small_board_win(board_data, next_board, 1)
            or self._is_small_board_win(board_data, next_board, -1)
            or self._is_board_full(board_data, next_board)
        )

    def _small_board_cells(self, inner_board_idx):
        """Flat indices of the nine cells of small board *inner_board_idx*."""
        row_block = inner_board_idx // 3
        col_block = inner_board_idx % 3

        # Top-left cell of the small board in the flat 81-cell layout.
        base = row_block * 27 + col_block * 3

        return [
            base, base + 1, base + 2,
            base + 9, base + 10, base + 11,
            base + 18, base + 19, base + 20,
        ]

    def _is_board_full(self, board_data, next_board):
        """True when every cell of small board *next_board* is occupied."""
        return all(board_data[a] != 0 for a in self._small_board_cells(next_board))

    def _is_playable_small_board(self, board_data, inner_board_idx):
        """A small board accepts moves while it is neither won nor full."""
        return not self.is_board_full(board_data, inner_board_idx)

    def has_legal_moves(self, board):
        """True while at least one legal move remains anywhere on the board."""
        return any(move == 1 for move in self.get_valid_moves(board))

    def get_valid_moves(self, board):
        """Return a length-81 0/1 mask of the legal moves for *board*.

        Moves are restricted to the active small board; when there is no
        active board (or it is closed), any empty cell of any open small
        board is legal.
        """
        board_data, active_board = board
        valid_moves = [0] * self.get_action_size()

        # A closed active board releases the placement constraint.
        if active_board is not None and not self._is_playable_small_board(board_data, active_board):
            active_board = None

        if active_board is None:
            target_boards = [
                inner_board_idx
                for inner_board_idx in range(9)
                if self._is_playable_small_board(board_data, inner_board_idx)
            ]
        else:
            target_boards = [active_board]

        for inner_board_idx in target_boards:
            for index in self._small_board_cells(inner_board_idx):
                if board_data[index] == 0:
                    valid_moves[index] = 1

        return valid_moves

    def _is_small_board_win(self, board_data, inner_board_idx, player):
        """True when *player* holds a winning line in the given small board."""
        cells = self._small_board_cells(inner_board_idx)
        for a, b, c in WIN_PATTERNS:
            if board_data[cells[a]] == board_data[cells[b]] == board_data[cells[c]] == player:
                return True
        return False

    def is_win(self, board, player):
        """True when *player* has won three small boards forming a line."""
        board_data, _ = board
        won = [self._is_small_board_win(board_data, i, player) for i in range(9)]

        # The nine small-board results form a meta 3x3 board.
        return any(won[a] and won[b] and won[c] for a, b, c in WIN_PATTERNS)

    def get_reward_for_player(self, board, player):
        """Game result from *player*'s perspective.

        Returns None while the game is still running, 1 for a win, -1 for a
        loss, and 0 for a draw (no legal moves and no winner).
        """
        if self.is_win(board, player):
            return 1
        if self.is_win(board, -player):
            return -1
        if self.has_legal_moves(board):
            return None
        return 0

    def get_canonical_board_data(self, board_data, player):
        """Flip the board so the given player's stones are always +1."""
        return player * board_data

    def _small_board_mask(self, inner_board_idx):
        """9x9 float mask with 1.0 on the cells of the given small board."""
        mask = np.zeros((self.board_width, self.board_width), dtype=np.float32)
        for index in self._small_board_cells(inner_board_idx):
            row = index // self.board_width
            col = index % self.board_width
            mask[row, col] = 1.0
        return mask

    def encode_state(self, board):
        """Encode *board* into the (9, 9, 9) float32 planes fed to the network.

        Planes, in order: current-player stones, opponent stones, empty
        cells, legal moves, active-board mask, small boards won by the
        current player, small boards won by the opponent, open small
        boards, and a constant move-count-fraction plane.
        """
        board_data, active_board = board
        board_grid = board_data.reshape(self.board_width, self.board_width)

        current_stones = (board_grid == 1).astype(np.float32)
        opponent_stones = (board_grid == -1).astype(np.float32)
        empty_cells = (board_grid == 0).astype(np.float32)
        legal_moves = np.array(self.get_valid_moves(board), dtype=np.float32).reshape(self.board_width, self.board_width)

        # The mask only marks a *playable* active board; a closed one would
        # not constrain the next move anyway.
        active_board_mask = np.zeros((self.board_width, self.board_width), dtype=np.float32)
        if active_board is not None and self._is_playable_small_board(board_data, active_board):
            active_board_mask = self._small_board_mask(active_board)

        current_won_boards = np.zeros((self.board_width, self.board_width), dtype=np.float32)
        opponent_won_boards = np.zeros((self.board_width, self.board_width), dtype=np.float32)
        playable_boards = np.zeros((self.board_width, self.board_width), dtype=np.float32)

        for inner_board_idx in range(9):
            board_mask = self._small_board_mask(inner_board_idx)
            if self._is_small_board_win(board_data, inner_board_idx, 1):
                current_won_boards += board_mask
            elif self._is_small_board_win(board_data, inner_board_idx, -1):
                opponent_won_boards += board_mask

            if self._is_playable_small_board(board_data, inner_board_idx):
                playable_boards += board_mask

        # Fraction of the board already filled, broadcast as a constant plane.
        move_count = np.count_nonzero(board_data) / self.cells
        move_count_plane = np.full((self.board_width, self.board_width), move_count, dtype=np.float32)

        return np.stack(
            (
                current_stones,
                opponent_stones,
                empty_cells,
                legal_moves,
                active_board_mask,
                current_won_boards,
                opponent_won_boards,
                playable_boards,
                move_count_plane,
            ),
            axis=0,
        )
|
||||
190
mcts.py
Normal file
190
mcts.py
Normal file
@@ -0,0 +1,190 @@
|
||||
import torch
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
|
||||
def ucb_score(parent, child):
    """Upper-confidence bound for following the edge from *parent* to *child*.

    Combines the child's prior probability (scaled by how much the parent
    has been explored) with the negated child value — negated because the
    child's value is stored from the opposing player's perspective.
    """
    exploration = child.prior * math.sqrt(parent.visit_count) / (child.visit_count + 1)
    exploitation = -child.value() if child.visit_count > 0 else 0
    return exploitation + exploration
|
||||
|
||||
|
||||
class Node:
    """A single search-tree node holding visit statistics and child edges."""

    def __init__(self, prior, to_play):
        self.visit_count = 0
        self.to_play = to_play       # player (1 or -1) to move at this node
        self.prior = prior           # policy probability assigned by the network
        self.value_sum = 0
        self.children = {}           # action -> Node
        self.state = None            # game state, filled in by expand()

    def expanded(self):
        """A node counts as expanded once it has at least one child."""
        return bool(self.children)

    def value(self):
        """Mean backed-up value over all visits (0 for an unvisited node)."""
        return self.value_sum / self.visit_count if self.visit_count else 0

    def select_action(self, temperature):
        """Sample an action from the children's visit-count distribution.

        temperature 0 is greedy (most-visited), infinity is uniform, and any
        other value tempers the visit counts before sampling.
        """
        actions = list(self.children.keys())
        visit_counts = np.array([c.visit_count for c in self.children.values()])

        if temperature == 0:
            return actions[int(np.argmax(visit_counts))]
        if temperature == float("inf"):
            return np.random.choice(actions)

        # See paper appendix, Data Generation.
        weights = visit_counts ** (1 / temperature)
        weights = weights / sum(weights)
        return np.random.choice(actions, p=weights)

    def select_child(self):
        """Return the (action, child) pair with the highest UCB score."""
        best = (-np.inf, -1, None)
        for action, child in self.children.items():
            score = ucb_score(self, child)
            if score > best[0]:
                best = (score, action, child)
        return best[1], best[2]

    def expand(self, state, to_play, action_probs):
        """Create one child per action with non-zero prior probability."""
        self.to_play = to_play
        self.state = state
        for action, prob in enumerate(action_probs):
            if prob != 0:
                self.children[action] = Node(prior=prob, to_play=-self.to_play)

    def __repr__(self):
        """Debugger pretty print node info."""
        return "{} Prior: {} Count: {} Value: {}".format(
            self.state.__str__(),
            "{0:.2f}".format(self.prior),
            self.visit_count,
            self.value(),
        )
|
||||
|
||||
|
||||
class MCTS:
    """Monte-Carlo tree search guided by the model's policy/value predictions.

    States are always evaluated from the perspective of the player to move
    (canonical boards), so the model is only ever queried as "player 1".
    """

    def __init__(self, game, model, args):
        # args is a dict-like config; 'num_simulations' is required and the
        # two root-noise keys are optional (absent/None disables the noise).
        self.game = game
        self.model = model
        self.args = args

    def _masked_policy(self, state, model):
        """Query *model* on *state*, zero out illegal moves, and renormalize.

        Returns (action_probs, value).  Falls back to a uniform distribution
        over legal moves when the network assigns them no probability mass.
        """
        encoded_state = self.game.encode_state(state)
        action_probs, value = model.predict(encoded_state)
        valid_moves = np.array(self.game.get_valid_moves(state), dtype=np.float32)
        action_probs = action_probs * valid_moves
        total_prob = np.sum(action_probs)
        total_valid = np.sum(valid_moves)
        if total_valid <= 0:
            # No legal moves at all: the all-zero mask doubles as the "policy".
            return valid_moves, float(value)
        if total_prob <= 0:
            # Network put no mass on any legal move; use a uniform fallback.
            action_probs = valid_moves / total_valid
        else:
            action_probs /= total_prob
        return action_probs, float(value)

    def _add_exploration_noise(self, node):
        """Mix Dirichlet noise into the root children's priors (AlphaZero-style).

        No-op when either noise parameter is missing/None or the node has no
        children.
        """
        alpha = self.args.get('root_dirichlet_alpha')
        fraction = self.args.get('root_exploration_fraction')
        if alpha is None or fraction is None or not node.children:
            return

        actions = list(node.children.keys())
        noise = np.random.dirichlet([alpha] * len(actions))
        for action, sample in zip(actions, noise):
            child = node.children[action]
            child.prior = child.prior * (1 - fraction) + sample * fraction

    def run(self, model, state, to_play):
        """Run `num_simulations` simulations from *state*; return the root node.

        *model* may be None, in which case the instance's model is used.
        *state* is assumed to already be canonical for the player to move.
        """
        model = model or self.model

        root = Node(0, to_play)
        root.state = state

        # Terminal position: record the reward directly, no search needed.
        reward = self.game.get_reward_for_player(state, player=1)
        if reward is not None:
            root.value_sum = float(reward)
            root.visit_count = 1
            return root

        # EXPAND root
        action_probs, value = self._masked_policy(state, model)
        root.expand(state, to_play, action_probs)
        if not root.children:
            # No children could be created; treat the root as a leaf.
            root.value_sum = float(value)
            root.visit_count = 1
            return root
        self._add_exploration_noise(root)

        for _ in range(self.args['num_simulations']):
            node = root
            search_path = [node]

            # SELECT: walk down the tree following the best-UCB child.
            xp = False
            while node.expanded():
                action, node = node.select_child()
                if node == None:
                    # Defensive branch: select_child yielded no child.  It
                    # backpropagates `value` from an earlier expansion —
                    # NOTE(review): looks like a stale value; confirm this
                    # path can actually occur.
                    parent = search_path[-1]
                    self.backpropagate(search_path, value, parent.to_play * -1)
                    xp = True
                    break
                search_path.append(node)
            if xp:
                continue
            parent = search_path[-2]
            state = parent.state
            # Now we're at a leaf node and we would like to expand
            # Players always play from their own perspective
            next_state, _ = self.game.get_next_state(state, player=1, action=action)
            # Get the board from the perspective of the other player
            next_state_data, next_state_inner = next_state

            next_state = (self.game.get_canonical_board_data(next_state_data, player=-1), next_state_inner)

            # The value of the new state from the perspective of the other player
            value = self.game.get_reward_for_player(next_state, player=1)
            if value is None:
                # If the game has not ended:
                # EXPAND
                action_probs, value = self._masked_policy(next_state, model)
                node.expand(next_state, parent.to_play * -1, action_probs)

            self.backpropagate(search_path, float(value), parent.to_play * -1)

        return root

    def backpropagate(self, search_path, value, to_play):
        """
        At the end of a simulation, we propagate the evaluation all the way up the tree
        to the root.

        *value* is from the perspective of *to_play*; each node on the path
        accumulates it with the sign flipped for the opposing side.
        """
        for node in reversed(search_path):
            node.value_sum += value if node.to_play == to_play else -value
            node.visit_count += 1
|
||||
81
model.py
Normal file
81
model.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with an identity skip connection."""

    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Skip connection, then the final non-linearity.
        return F.relu(out + x)


class UltimateTicTacToeModel(nn.Module):
    """AlphaZero-style residual network with separate policy and value heads.

    Input: an encoded board of shape ``board_size`` (planes, height, width).
    ``forward`` returns a softmax policy over all actions and a tanh value
    in [-1, 1].
    """

    def __init__(self, board_size, action_size, device, channels=64, num_blocks=6):
        super().__init__()

        self.action_size = action_size
        self.input_shape = board_size
        self.input_channels = board_size[0]
        self.board_height = board_size[1]
        self.board_width = board_size[2]
        self.device = torch.device(device)

        # Shared trunk: stem convolution followed by a residual tower.
        self.stem = nn.Sequential(
            nn.Conv2d(self.input_channels, channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
            nn.ReLU(inplace=True),
        )
        self.residual_tower = nn.Sequential(*(ResidualBlock(channels) for _ in range(num_blocks)))

        # Policy head: 1x1 conv then a linear layer over the flattened map.
        self.policy_head = nn.Sequential(
            nn.Conv2d(channels, 32, kernel_size=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        )
        self.policy_fc = nn.Linear(32 * self.board_height * self.board_width, self.action_size)

        # Value head: 1x1 conv, one hidden linear layer, scalar output.
        self.value_head = nn.Sequential(
            nn.Conv2d(channels, 32, kernel_size=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        )
        self.value_fc1 = nn.Linear(32 * self.board_height * self.board_width, 128)
        self.value_fc2 = nn.Linear(128, 1)

        self.to(self.device)

    def forward(self, x):
        """Return (policy_probs, value) for a batch of encoded boards."""
        trunk = self.residual_tower(self.stem(x.view(-1, *self.input_shape)))

        policy_logits = self.policy_fc(torch.flatten(self.policy_head(trunk), 1))

        value_hidden = torch.flatten(self.value_head(trunk), 1)
        value_hidden = F.relu(self.value_fc1(value_hidden))
        value = torch.tanh(self.value_fc2(value_hidden))

        return F.softmax(policy_logits, dim=1), value

    def predict(self, board):
        """Evaluate one encoded board; returns (policy ndarray, value float).

        Runs in eval mode without gradients, so batch-norm uses running stats.
        """
        tensor = torch.as_tensor(board, dtype=torch.float32, device=self.device)
        tensor = tensor.view(1, *self.input_shape)
        self.eval()
        with torch.no_grad():
            pi, v = self.forward(tensor)

        return pi.detach().cpu().numpy()[0], float(v.item())
|
||||
384
run.py
Normal file
384
run.py
Normal file
@@ -0,0 +1,384 @@
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from game import UltimateTicTacToe
|
||||
from mcts import MCTS
|
||||
from model import UltimateTicTacToeModel
|
||||
from trainer import Trainer
|
||||
|
||||
|
||||
# Default hyper-parameters shared by every sub-command; each key mirrors a
# CLI flag defined in build_parser.  "numIters"/"numEps" keep the camelCase
# spelling that the Trainer config expects.
DEFAULT_ARGS = {
    "num_simulations": 100,            # MCTS rollouts per move in self-play
    "numIters": 50,                    # outer training iterations
    "numEps": 20,                      # self-play games per iteration
    "epochs": 5,                       # training passes per iteration
    "batch_size": 64,                  # mini-batch size for gradient updates
    "lr": 5e-4,                        # Adam learning rate
    "weight_decay": 1e-4,              # Adam weight decay
    "replay_buffer_size": 50000,       # max retained training examples
    "value_loss_weight": 1.0,          # value-head weight in the total loss
    "grad_clip_norm": 5.0,             # global gradient-norm clip threshold
    "checkpoint_path": "latest.pth",   # load/save path for model checkpoints
    "temperature_threshold": 10,       # moves before greedy self-play selection
    "root_dirichlet_alpha": 0.3,       # Dirichlet alpha for root noise
    "root_exploration_fraction": 0.25, # prior fraction replaced by noise
    "arena_compare_games": 6,          # candidate-vs-previous games per iter
    "arena_accept_threshold": 0.55,    # min average points to keep candidate
    "arena_compare_simulations": 8,    # MCTS sims per move in arena games
}
|
||||
|
||||
|
||||
def get_device(device_arg):
    """Resolve the torch device string.

    An explicit, non-empty argument wins; otherwise prefer CUDA when
    available and fall back to CPU.
    """
    if not device_arg:
        return "cuda" if torch.cuda.is_available() else "cpu"
    return device_arg
|
||||
|
||||
|
||||
def build_model(game, device):
    """Construct a fresh UltimateTicTacToeModel sized for *game* on *device*."""
    board_size = game.get_board_size()
    action_size = game.get_action_size()
    return UltimateTicTacToeModel(board_size, action_size, device)
|
||||
|
||||
|
||||
def load_checkpoint(model, checkpoint_path, device, optimizer=None, required=True):
    """Load model (and optionally optimizer) state from *checkpoint_path*.

    Returns True when the checkpoint was loaded.  If the file is missing,
    raises FileNotFoundError when *required*, otherwise returns False.
    A successfully loaded model is left in eval mode.
    """
    path = Path(checkpoint_path)
    if path.exists():
        state = torch.load(path, map_location=device)
        model.load_state_dict(state["state_dict"])
        # Optimizer state is optional: older checkpoints may not carry it.
        if optimizer is not None and "optimizer_state_dict" in state:
            optimizer.load_state_dict(state["optimizer_state_dict"])
        model.eval()
        return True
    if required:
        raise FileNotFoundError(f"Checkpoint not found: {path}")
    return False
|
||||
|
||||
|
||||
def canonical_state(game, state, player):
    """Return *state* flipped to the given player's perspective.

    The active-board index is perspective-independent and passes through.
    """
    board_data, active_board = state
    canonical_data = game.get_canonical_board_data(board_data, player)
    return (canonical_data, active_board)
|
||||
|
||||
|
||||
def apply_moves(game, moves):
    """Replay *moves* from the initial position, validating each one.

    Returns the resulting (state, player-to-move).  Raises ValueError on
    the first illegal move in the sequence.
    """
    state, player = game.get_init_board(), 1
    for action in moves:
        outcome = game.get_next_state(state, player, action, verify_move=True)
        if outcome is False:
            raise ValueError(f"Illegal move in sequence: {action}")
        state, player = outcome
    return state, player
|
||||
|
||||
|
||||
def format_board(board_data):
    """Render the 81-cell board as a human-readable 9x9 grid.

    Columns are separated into 3-cell groups with " | " and horizontal
    rules are inserted after rows 2 and 5 to outline the small boards.
    """
    glyphs = {1: "X", -1: "O", 0: "."}
    lines = []
    for row in range(9):
        marks = [glyphs[int(board_data[row * 9 + col])] for col in range(9)]
        triples = (" ".join(marks[start:start + 3]) for start in (0, 3, 6))
        lines.append(" | ".join(triples))
        if row == 2 or row == 5:
            lines.append("-" * 23)
    return "\n".join(lines)
|
||||
|
||||
|
||||
def top_policy_moves(policy, limit):
    """Return up to *limit* (action, probability) pairs, highest first."""
    order = np.argsort(policy)[::-1]
    return [(int(a), float(policy[a])) for a in order[:limit]]
|
||||
|
||||
|
||||
def parse_moves(text):
    """Parse a comma-separated move string ("0,10,4") into a list of ints.

    Empty or None input yields an empty list; blank segments are skipped.
    """
    if not text:
        return []
    tokens = (piece.strip() for piece in text.split(","))
    return [int(token) for token in tokens if token]
|
||||
|
||||
|
||||
def parse_action(text):
    """Parse user input as a flat move index ("12") or as "row col" ("1 3").

    Commas are treated like whitespace.  Raises ValueError for malformed
    input or moves outside the 81-cell board.
    """
    tokens = text.strip().replace(",", " ").split()
    if len(tokens) == 1:
        action = int(tokens[0])
    elif len(tokens) == 2:
        row, col = int(tokens[0]), int(tokens[1])
        if not (0 <= row < 9 and 0 <= col < 9):
            raise ValueError("Row and column must be in [0, 8].")
        action = row * 9 + col
    else:
        raise ValueError("Enter either a flat move index or 'row col'.")
    if not (0 <= action < 81):
        raise ValueError("Move index must be in [0, 80].")
    return action
|
||||
|
||||
|
||||
def scalar_value(value):
    """Collapse a scalar-like value (float, 0-d or 1-element array) to a float."""
    flat = np.asarray(value).reshape(-1)
    return float(flat[0])
|
||||
|
||||
|
||||
def train_command(args):
    """Entry point for the `train` sub-command.

    Builds the game and model, merges CLI flags over DEFAULT_ARGS into the
    Trainer config, optionally resumes from a checkpoint, and runs training.
    """
    device = get_device(args.device)
    game = UltimateTicTacToe()
    model = build_model(game, device)

    # Start from the shared defaults and overlay every CLI flag.
    train_args = dict(DEFAULT_ARGS)
    train_args.update(
        {
            "num_simulations": args.num_simulations,
            "numIters": args.num_iters,
            "numEps": args.num_eps,
            "epochs": args.epochs,
            "batch_size": args.batch_size,
            "lr": args.lr,
            "weight_decay": args.weight_decay,
            "replay_buffer_size": args.replay_buffer_size,
            "value_loss_weight": args.value_loss_weight,
            "grad_clip_norm": args.grad_clip_norm,
            "checkpoint_path": args.checkpoint,
            "temperature_threshold": args.temperature_threshold,
            "root_dirichlet_alpha": args.root_dirichlet_alpha,
            "root_exploration_fraction": args.root_exploration_fraction,
            "arena_compare_games": args.arena_compare_games,
            "arena_accept_threshold": args.arena_accept_threshold,
            "arena_compare_simulations": args.arena_compare_simulations,
        }
    )

    trainer = Trainer(game, model, train_args)
    if args.resume:
        # Restore both model and optimizer state so training continues
        # from where the previous run left off.
        load_checkpoint(model, args.checkpoint, device, optimizer=trainer.optimizer)
    trainer.learn()
|
||||
|
||||
|
||||
def eval_command(args):
    """Entry point for the `eval` sub-command.

    Replays --moves from the start position, prints the board, the raw
    network policy/value, and (with --with-mcts) the MCTS best move.
    """
    device = get_device(args.device)
    game = UltimateTicTacToe()
    model = build_model(game, device)
    load_checkpoint(model, args.checkpoint, device)

    moves = parse_moves(args.moves)
    state, player = apply_moves(game, moves)
    # The network always evaluates from the side-to-move's perspective.
    current_state = canonical_state(game, state, player)
    encoded = game.encode_state(current_state)
    policy, value = model.predict(encoded)
    # Mask out illegal moves and renormalize so probabilities sum to 1.
    legal_mask = np.array(game.get_valid_moves(state), dtype=np.float32)
    policy = policy * legal_mask
    if policy.sum() > 0:
        policy = policy / policy.sum()

    print("Board:")
    print(format_board(state[0]))
    print()
    print(f"Side to move: {'X' if player == 1 else 'O'}")
    print(f"Active small board: {state[1]}")
    print(f"Model value: {scalar_value(value):.4f}")
    print("Top policy moves:")
    for action, prob in top_policy_moves(policy, args.top_k):
        print(f"  {action:2d} -> {prob:.4f}")

    if args.with_mcts:
        # Deterministic search: root noise disabled for evaluation.
        mcts_args = dict(DEFAULT_ARGS)
        mcts_args.update(
            {
                "num_simulations": args.num_simulations,
                "root_dirichlet_alpha": None,
                "root_exploration_fraction": None,
            }
        )
        root = MCTS(game, model, mcts_args).run(model, current_state, to_play=1)
        action = root.select_action(temperature=0)
        print(f"MCTS best move: {action}")
|
||||
|
||||
|
||||
def ai_action(game, model, state, player, num_simulations):
    """Pick the AI's move for *player*.

    Runs a noise-free MCTS from the player's canonical perspective and
    returns the most-visited action.
    """
    perspective_state = canonical_state(game, state, player)
    search_args = dict(DEFAULT_ARGS)
    search_args.update(
        {
            "num_simulations": num_simulations,
            "root_dirichlet_alpha": None,
            "root_exploration_fraction": None,
        }
    )
    root = MCTS(game, model, search_args).run(model, perspective_state, to_play=1)
    return root.select_action(temperature=0)
|
||||
|
||||
|
||||
def random_action(game, state):
    """Return a uniformly random legal action; raise ValueError if none exist."""
    legal = [action for action, allowed in enumerate(game.get_valid_moves(state)) if allowed]
    if not legal:
        raise ValueError("No legal actions available.")
    return int(np.random.choice(legal))
|
||||
|
||||
|
||||
def load_player_model(game, checkpoint, device):
    """Build a model for *game* and load the given checkpoint into it."""
    player_model = build_model(game, device)
    load_checkpoint(player_model, checkpoint, device)
    return player_model
|
||||
|
||||
|
||||
def choose_action(game, player_kind, model, state, player, num_simulations):
    """Dispatch to a random or MCTS-backed move depending on *player_kind*."""
    if player_kind != "random":
        return ai_action(game, model, state, player, num_simulations)
    return random_action(game, state)
|
||||
|
||||
|
||||
def play_match(game, x_kind, x_model, o_kind, o_model, num_simulations):
    """Play one full game between the X and O agents.

    Returns 1 if X wins, -1 if O wins, 0 for a draw.
    """
    state = game.get_init_board()
    player = 1

    while True:
        # Reward is from the perspective of the side to move; translate it
        # back into an absolute winner (1 = X, -1 = O, 0 = draw).
        reward = game.get_reward_for_player(state, player)
        if reward is not None:
            if reward == 0:
                return 0
            return player if reward == 1 else -player

        if player == 1:
            action = choose_action(game, x_kind, x_model, state, player, num_simulations)
        else:
            action = choose_action(game, o_kind, o_model, state, player, num_simulations)
        state, player = game.get_next_state(state, player, action)
|
||||
|
||||
|
||||
def arena_command(args):
    """Entry point for the `arena` sub-command.

    Pits the configured X and O agents against each other for --games
    matches and prints the aggregate win/draw counts.
    """
    device = get_device(args.device)
    game = UltimateTicTacToe()

    # Models are only loaded for sides configured as "checkpoint" players;
    # random players leave their model as None.
    x_model = None
    o_model = None
    if args.x_player == "checkpoint":
        x_model = load_player_model(game, args.x_checkpoint, device)
    if args.o_player == "checkpoint":
        o_model = load_player_model(game, args.o_checkpoint, device)

    # Tally keyed by winner: 1 = X wins, -1 = O wins, 0 = draws.
    results = {1: 0, -1: 0, 0: 0}
    for _ in range(args.games):
        winner = play_match(
            game,
            args.x_player,
            x_model,
            args.o_player,
            o_model,
            args.num_simulations,
        )
        results[winner] += 1

    print(f"Games: {args.games}")
    print(f"X ({args.x_player}) wins: {results[1]}")
    print(f"O ({args.o_player}) wins: {results[-1]}")
    print(f"Draws: {results[0]}")
|
||||
|
||||
|
||||
def play_command(args):
    """Entry point for the `play` sub-command: interactive human-vs-AI game.

    Loops printing the board, prompting the human on their turn (re-asking
    until a legal move is entered) and running MCTS for the AI's turn, until
    the game ends.
    """
    device = get_device(args.device)
    game = UltimateTicTacToe()
    model = build_model(game, device)
    load_checkpoint(model, args.checkpoint, device)

    state = game.get_init_board()
    player = 1
    human_player = args.human_player

    while True:
        print()
        print(format_board(state[0]))
        print(f"Turn: {'X' if player == 1 else 'O'}")
        print(f"Active small board: {state[1]}")

        # Reward is from the side-to-move's perspective; translate into X/O.
        reward = game.get_reward_for_player(state, player)
        if reward is not None:
            if reward == 0:
                print("Result: draw")
            else:
                winner = player if reward == 1 else -player
                print(f"Winner: {'X' if winner == 1 else 'O'}")
            return

        valid_moves = game.get_valid_moves(state)
        legal_actions = [index for index, allowed in enumerate(valid_moves) if allowed]
        print(f"Legal moves: {legal_actions}")

        if player == human_player:
            # Keep prompting until the human enters a parseable, legal move.
            while True:
                try:
                    action = parse_action(input("Your move (index or 'row col'): "))
                    next_state = game.get_next_state(state, player, action, verify_move=True)
                    if next_state is False:
                        raise ValueError(f"Illegal move: {action}")
                    state, player = next_state
                    break
                except ValueError as exc:
                    print(exc)
        else:
            action = ai_action(game, model, state, player, args.num_simulations)
            print(f"AI move: {action}")
            state, player = game.get_next_state(state, player, action)
|
||||
|
||||
|
||||
def build_parser():
    """Assemble the argparse CLI with train / eval / play / arena subcommands."""
    parser = argparse.ArgumentParser(description="Ultimate Tic-Tac-Toe Runner")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # --- train: self-play training loop -----------------------------------
    train = subparsers.add_parser("train", help="Train the model with self-play")
    train.add_argument("--device")
    train.add_argument("--checkpoint", default=DEFAULT_ARGS["checkpoint_path"])
    train.add_argument("--resume", action="store_true")
    # (cli flag, value type, DEFAULT_ARGS key) for every numeric option.
    for flag, cast, key in [
        ("--num-simulations", int, "num_simulations"),
        ("--num-iters", int, "numIters"),
        ("--num-eps", int, "numEps"),
        ("--epochs", int, "epochs"),
        ("--batch-size", int, "batch_size"),
        ("--lr", float, "lr"),
        ("--weight-decay", float, "weight_decay"),
        ("--replay-buffer-size", int, "replay_buffer_size"),
        ("--value-loss-weight", float, "value_loss_weight"),
        ("--grad-clip-norm", float, "grad_clip_norm"),
        ("--temperature-threshold", int, "temperature_threshold"),
        ("--root-dirichlet-alpha", float, "root_dirichlet_alpha"),
        ("--root-exploration-fraction", float, "root_exploration_fraction"),
        ("--arena-compare-games", int, "arena_compare_games"),
        ("--arena-accept-threshold", float, "arena_accept_threshold"),
        ("--arena-compare-simulations", int, "arena_compare_simulations"),
    ]:
        train.add_argument(flag, type=cast, default=DEFAULT_ARGS[key])
    train.set_defaults(func=train_command)

    # --- eval: inspect network output on a position -----------------------
    evaluate = subparsers.add_parser("eval", help="Inspect a checkpoint on a position")
    evaluate.add_argument("--device")
    evaluate.add_argument("--checkpoint", default=DEFAULT_ARGS["checkpoint_path"])
    evaluate.add_argument("--moves", default="", help="Comma-separated move sequence")
    evaluate.add_argument("--top-k", type=int, default=10)
    evaluate.add_argument("--with-mcts", action="store_true")
    evaluate.add_argument("--num-simulations", type=int, default=DEFAULT_ARGS["num_simulations"])
    evaluate.set_defaults(func=eval_command)

    # --- play: human vs. checkpoint ---------------------------------------
    human_game = subparsers.add_parser("play", help="Play against the checkpoint")
    human_game.add_argument("--device")
    human_game.add_argument("--checkpoint", default=DEFAULT_ARGS["checkpoint_path"])
    human_game.add_argument("--human-player", type=int, choices=[1, -1], default=1)
    human_game.add_argument("--num-simulations", type=int, default=DEFAULT_ARGS["num_simulations"])
    human_game.set_defaults(func=play_command)

    # --- arena: automated head-to-head matches ----------------------------
    arena = subparsers.add_parser("arena", help="Run repeated matches between agents")
    arena.add_argument("--device")
    arena.add_argument("--games", type=int, default=20)
    arena.add_argument("--num-simulations", type=int, default=DEFAULT_ARGS["num_simulations"])
    arena.add_argument("--x-player", choices=["checkpoint", "random"], default="checkpoint")
    arena.add_argument("--o-player", choices=["checkpoint", "random"], default="random")
    arena.add_argument("--x-checkpoint", default=DEFAULT_ARGS["checkpoint_path"])
    arena.add_argument("--o-checkpoint", default=DEFAULT_ARGS["checkpoint_path"])
    arena.set_defaults(func=arena_command)

    return parser
||||
|
||||
def main():
    """CLI entry point: parse command-line args and dispatch to the subcommand."""
    cli = build_parser()
    parsed = cli.parse_args()
    # Each subparser registered its handler via set_defaults(func=...).
    parsed.func(parsed)


if __name__ == "__main__":
    main()
|
||||
258
trainer.py
Normal file
258
trainer.py
Normal file
@@ -0,0 +1,258 @@
|
||||
import os
|
||||
import math
|
||||
import copy
|
||||
import numpy as np
|
||||
from collections import deque
|
||||
from random import shuffle
|
||||
from progressbar import ProgressBar, Percentage, Bar, ETA, AdaptiveETA, FormatLabel
|
||||
|
||||
|
||||
import torch
|
||||
import torch.optim as optim
|
||||
|
||||
from mcts import MCTS
|
||||
|
||||
# Mutable module-level state shared between Trainer.learn and the progress-bar
# widget below; Trainer.learn overwrites these values after every iteration.
loss = {'ploss': float('inf'), 'pkl': float('inf'), 'vloss': float('inf'), 'current': 0, 'max': 0}


class LossWidget:
    """Progress-bar widget that renders the latest training losses."""

    def __call__(self, progress, data=None):
        # Read the live values from the shared ``loss`` dict on every render.
        return " {}/{} P.CE: {:.4f}, P.KL: {:.4f}, V.Loss: {:.4f}".format(
            loss['current'],
            loss['max'],
            loss['ploss'],
            loss['pkl'],
            loss['vloss'],
        )
||||
class Trainer:
    """AlphaZero-style training loop: self-play -> gradient updates -> arena gating.

    ``args`` is a plain dict of hyperparameters (see the CLI defaults); new
    weights are only kept when they beat the previous model in the arena.
    """

    def __init__(self, game, model, args):
        self.game = game
        self.model = model
        self.args = args
        self.mcts = MCTS(self.game, self.model, self.args)
        self.optimizer = optim.Adam(
            self.model.parameters(),
            lr=self.args.get('lr', 5e-4),
            weight_decay=self.args.get('weight_decay', 1e-4),
        )
        # Sliding window of training examples; the oldest fall off the left.
        self.replay_buffer = deque(maxlen=self.args.get('replay_buffer_size', 50000))

    def _ai_action(self, model, state, player):
        """Pick a deterministic (temperature=0) MCTS move for arena play.

        Root Dirichlet noise is disabled so arena games measure pure playing
        strength rather than exploration behaviour.
        """
        board_data, active_board = state
        canonical_state = (self.game.get_canonical_board_data(board_data, player), active_board)
        mcts_args = dict(self.args)
        mcts_args.update(
            {
                'num_simulations': self.args.get('arena_compare_simulations', self.args['num_simulations']),
                'root_dirichlet_alpha': None,
                'root_exploration_fraction': None,
            }
        )
        root = MCTS(self.game, model, mcts_args).run(model, canonical_state, to_play=1)
        return root.select_action(temperature=0)

    def _play_arena_game(self, x_model, o_model):
        """Play one full game; return 1 (X wins), -1 (O wins) or 0 (draw)."""
        state = self.game.get_init_board()
        current_player = 1

        while True:
            reward = self.game.get_reward_for_player(state, current_player)
            if reward is not None:
                if reward == 0:
                    return 0
                # reward is from current_player's perspective; map back to X/O.
                return current_player if reward == 1 else -current_player

            model = x_model if current_player == 1 else o_model
            action = self._ai_action(model, state, current_player)
            state, current_player = self.game.get_next_state(state, current_player, action)

    def evaluate_candidate(self, candidate_model, reference_model):
        """Arena-test ``candidate_model`` against ``reference_model``.

        Colours alternate across two halves so neither model keeps the
        first-move advantage, and play stops early once the outcome is
        mathematically decided either way. Returns ``(accepted, score)``
        where score = candidate points (win=1, draw=0.5) / games.
        """
        games = self.args.get('arena_compare_games', 0)
        if games <= 0:
            # Gating disabled: accept unconditionally with a neutral score.
            return True, 0.5

        # BUGFIX: put both networks in inference mode for the arena. The
        # candidate otherwise stays in train() mode after the gradient
        # updates, so dropout/batch-norm would corrupt the gating decision.
        candidate_model.eval()
        reference_model.eval()

        candidate_points = 0.0
        required_points = self.args.get('arena_accept_threshold', 0.55) * games
        candidate_first_games = (games + 1) // 2
        candidate_second_games = games // 2
        games_played = 0

        # First half: candidate plays X.
        for _ in range(candidate_first_games):
            winner = self._play_arena_game(candidate_model, reference_model)
            if winner == 1:
                candidate_points += 1.0
            elif winner == 0:
                candidate_points += 0.5
            games_played += 1
            remaining_games = games - games_played
            if candidate_points >= required_points:
                return True, candidate_points / games
            if candidate_points + remaining_games < required_points:
                return False, candidate_points / games

        # Second half: candidate plays O.
        for _ in range(candidate_second_games):
            winner = self._play_arena_game(reference_model, candidate_model)
            if winner == -1:
                candidate_points += 1.0
            elif winner == 0:
                candidate_points += 0.5
            games_played += 1
            remaining_games = games - games_played
            if candidate_points >= required_points:
                return True, candidate_points / games
            if candidate_points + remaining_games < required_points:
                return False, candidate_points / games

        score = candidate_points / games
        return score >= self.args.get('arena_accept_threshold', 0.55), score

    def execute_episode(self):
        """Play one self-play game from the start position.

        Returns a list of (encoded_state, action_probs, outcome) training
        examples, with the final result sign-flipped for the losing side's
        positions.
        """
        train_examples = []
        current_player = 1
        state = self.game.get_init_board()
        episode_step = 0

        while True:
            board_data, active_board = state
            canonical_board_data = self.game.get_canonical_board_data(board_data, current_player)
            canonical_board = (canonical_board_data, active_board)

            # Fresh search tree for every move (no subtree reuse).
            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model, canonical_board, to_play=1)

            # Normalized visit counts form the policy training target.
            action_probs = np.zeros(self.game.get_action_size(), dtype=np.float32)
            for action_index, child in root.children.items():
                action_probs[action_index] = child.visit_count
            action_probs = action_probs / np.sum(action_probs)

            encoded_state = self.game.encode_state(canonical_board)
            train_examples.append((encoded_state, current_player, action_probs))

            # Early moves sample stochastically for diversity; later moves
            # are played greedily.
            temperature_threshold = self.args.get('temperature_threshold', 10)
            temperature = 1 if episode_step < temperature_threshold else 0
            action = root.select_action(temperature=temperature)
            state, current_player = self.game.get_next_state(state, current_player, action)
            reward = self.game.get_reward_for_player(state, current_player)
            episode_step += 1

            if reward is not None:
                # Propagate the final result to every stored position:
                # [encoded_state, action_probs, reward-from-that-player's-view]
                return [
                    (hist_state, hist_action_probs,
                     reward * ((-1) ** (hist_player != current_player)))
                    for hist_state, hist_player, hist_action_probs in train_examples
                ]

    # Backwards-compatible alias for the original (misspelled) method name.
    exceute_episode = execute_episode

    def learn(self):
        """Outer training loop: self-play, train, arena-gate, checkpoint."""
        widgets = [Percentage(), Bar(), AdaptiveETA(), LossWidget()]
        pbar = ProgressBar(max_value=self.args['numIters'], widgets=widgets)
        pbar.update(0)
        for i in range(1, self.args['numIters'] + 1):
            # Generate fresh self-play data for this iteration.
            train_examples = []
            for _ in range(self.args['numEps']):
                train_examples.extend(self.execute_episode())

            self.replay_buffer.extend(train_examples)
            training_examples = list(self.replay_buffer)
            shuffle(training_examples)

            # Snapshot the current model/optimizer so training can be rolled
            # back if the arena rejects the new weights.
            reference_model = copy.deepcopy(self.model)
            reference_model.eval()
            reference_state_dict = copy.deepcopy(self.model.state_dict())
            reference_optimizer_state = copy.deepcopy(self.optimizer.state_dict())

            results = self.train(training_examples)
            accepted, arena_score = self.evaluate_candidate(self.model, reference_model)
            if accepted:
                self.save_checkpoint(folder=".", filename=self.args['checkpoint_path'])
            else:
                # Rejected: restore model and optimizer from the snapshot.
                self.model.load_state_dict(reference_state_dict)
                self.optimizer.load_state_dict(reference_optimizer_state)

            # Publish the latest losses to the shared progress-bar widget.
            loss['ploss'] = float(results[0])
            loss['pkl'] = float(results[1])
            loss['vloss'] = float(results[2])
            loss['current'] = i
            loss['max'] = self.args['numIters']
            pbar.update(i)
        pbar.finish()

    def train(self, examples):
        """Run ``epochs`` passes of gradient descent over ``examples``.

        examples: list of (encoded_board, target_pi, target_v) tuples.
        Returns mean (policy CE, policy KL, value MSE) across all batches.
        """
        pi_losses = []
        pi_kls = []
        v_losses = []
        device = self.model.device

        for _ in range(self.args['epochs']):
            self.model.train()
            shuffled_indices = np.random.permutation(len(examples))

            for batch_start in range(0, len(examples), self.args['batch_size']):
                sample_ids = shuffled_indices[batch_start:batch_start + self.args['batch_size']]
                boards, pis, vs = list(zip(*[examples[i] for i in sample_ids]))
                boards = torch.FloatTensor(np.array(boards).astype(np.float32)).contiguous().to(device)
                target_pis = torch.FloatTensor(np.array(pis)).contiguous().to(device)
                target_vs = torch.FloatTensor(np.array(vs).astype(np.float32)).contiguous().to(device)

                # Forward pass. Cross-entropy (l_pi) is logged only; the KL
                # term plus the weighted value loss drives the gradient.
                out_pi, out_v = self.model(boards)
                l_pi = self.loss_pi(target_pis, out_pi)
                l_pi_kl = self.loss_pi_kl(target_pis, out_pi)
                l_v = self.loss_v(target_vs, out_v)
                total_loss = l_pi_kl + self.args.get('value_loss_weight', 1.0) * l_v

                pi_losses.append(float(l_pi.detach()))
                pi_kls.append(float(l_pi_kl.detach()))
                v_losses.append(float(l_v.detach()))

                self.optimizer.zero_grad()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.get('grad_clip_norm', 5.0))
                self.optimizer.step()

        return (np.mean(pi_losses), np.mean(pi_kls), np.mean(v_losses))

    def loss_pi(self, targets, outputs):
        """Cross-entropy between the target policy and predicted probabilities."""
        # clamp_min guards log(0) for actions with zero predicted mass.
        ce = -(targets * torch.log(outputs.clamp_min(1e-8))).sum(dim=1)
        return ce.mean()

    def loss_pi_kl(self, targets, outputs):
        """KL divergence KL(targets || outputs).

        Differs from cross-entropy only by the target entropy, which is
        constant w.r.t. the model, so both give the same gradients.
        """
        target_log = torch.log(targets.clamp_min(1e-8))
        output_log = torch.log(outputs.clamp_min(1e-8))
        kl = (targets * (target_log - output_log)).sum(dim=1)
        return kl.mean()

    def loss_v(self, targets, outputs):
        """Mean squared error between game outcomes and value predictions."""
        return torch.sum((targets - outputs.view(-1)) ** 2) / targets.size()[0]

    def save_checkpoint(self, folder, filename):
        """Persist model weights, optimizer state and args to folder/filename."""
        # makedirs handles nested paths; os.mkdir would fail on them.
        os.makedirs(folder, exist_ok=True)

        filepath = os.path.join(folder, filename)
        torch.save({
            'state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'args': self.args,
        }, filepath)
||||
Reference in New Issue
Block a user