Tutorial: DIAL‑trained Neural Models¶

Comparing DIAL‑trained Neural Models, Majority Baselines and Information-Theoretic Limits¶

This notebook provides a step‑by‑step tutorial for reproducing the experiments that compare:

  1. MajorityPlayers — analytical baseline players with no learning.
  2. NeuralNetPlayers trained with DIAL + DRU — communication‑learning agents following the approach introduced in Foerster et al., 2016.
  3. Information‑theoretic upper limits on achievable performance.

We explain the motivation for each component and annotate each section so the workflow is clear and reproducible.

Summary of Results¶

The tables below (generated later in the notebook) report:

  • The Majority baseline performance.
  • The NeuralNet DIAL+DRU training performance.
  • The information‑theoretic bound, computed using mutual information.

These metrics let us evaluate how closely the learned communication strategies approach the theoretical optimum; a sketch of how this bound can be derived follows below.
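For reference, the noiseless info_limit values appear to follow a rate–distortion argument: Player A compresses the n² i.i.d. fair field bits into an m‑bit message, so on average the message carries at most m/n² bits about the bit at the gun position, and the best achievable guessing accuracy a then satisfies 1 − H(1 − a) = m/n², where H is the binary entropy. The sketch below inverts this relation numerically; the helper names binary_entropy and info_limit_sketch are illustrative, and the library routine limit_from_mutual_information may compute the bound differently.

import math

def binary_entropy(p: float) -> float:
    """Binary entropy H(p) in bits (hypothetical helper, not from the library)."""
    if p <= 0.0 or p >= 1.0:
        return 0.0
    return -p * math.log2(p) - (1.0 - p) * math.log2(1.0 - p)

def info_limit_sketch(field_size: int, comms_size: int) -> float:
    """Best guessing accuracy when an m-bit message describes n^2 i.i.d. fair bits."""
    bits_per_cell = comms_size / field_size ** 2
    target = max(0.0, 1.0 - bits_per_cell)   # required entropy of the error probability
    lo, hi = 0.0, 0.5                         # bisection over the error probability
    for _ in range(100):
        mid = 0.5 * (lo + hi)
        if binary_entropy(mid) < target:
            lo = mid
        else:
            hi = mid
    return 1.0 - 0.5 * (lo + hi)

print(info_limit_sketch(4, 1))   # ~0.6461, matching the comms_size = 1 table below

For field_size = 4 and comms_size = 1 this gives approximately 0.6461, in line with the tables that follow.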

Results for comms_size = 1¶

field_size  comms_size  maj_mean  nn_dial_mean  info_limit
4           1           0.5988    0.6037        0.6461
8           1           0.5526    0.5475        0.5735
16          1           0.5270    0.5284        0.5368
32          1           0.5169    0.5093        0.5184

Results for comms_size = 2¶

field_size  comms_size  maj_mean  nn_dial_mean  info_limit
4           2           0.6360    0.6122        0.7051
8           2           0.5770    0.5607        0.6037
16          2           0.5354    0.5401        0.5520
32          2           0.5008    0.5157        0.5260

Results for comms_size = 4¶

field_size  comms_size  maj_mean  nn_dial_mean  info_limit
4           4           0.6885    0.6222        0.7855
8           4           0.5994    0.5560        0.6461
16          4           0.5574    0.5202        0.5735
32          4           0.5236    0.5147        0.5368

Full table¶

field_size  comms_size  maj_mean  nn_dial_mean  info_limit
4           1           0.5988    0.6037        0.6461
4           2           0.6442    0.6231        0.7051
4           4           0.6823    0.6784        0.7855
4           8           0.7422    0.6498        0.8900
8           1           0.5541    0.5540        0.5735
8           2           0.5755    0.5628        0.6037
8           4           0.5955    0.5801        0.6461
8           8           0.6388    0.5580        0.7051
16          1           0.5287    0.5238        0.5368
16          2           0.5311    0.5284        0.5520
16          4           0.5596    0.5337        0.5735
16          8           0.5662    0.5424        0.6037
32          1           0.5129    0.5182        0.5184
32          2           0.5194    0.5131        0.5260
32          4           0.5161    0.5137        0.5368
32          8           0.5301    0.5253        0.5520

Overview of the Experimental Procedure¶

This notebook builds and evaluates a full communication-learning pipeline:

  1. Define a game layout — field size, communication bandwidth, and number of games.
  2. Train NeuralNetPlayers under DIAL + DRU — sending continuous communication during training, with noise added via DRU to encourage discretisation.
  3. Extract and freeze the trained models — enabling deterministic communication at evaluation time.
  4. Run a Tournament — evaluating trained players, majority baselines, and computing theoretical limits.

These components follow the structure proposed in referential-communication literature such as:

  • Foerster et al. Learning to Communicate with Deep Multi‑Agent Reinforcement Learning (2016).
  • Lowe et al. Multi‑Agent Actor‑Critic for Mixed Cooperative‑Competitive Environments (2017).

The goal is to assess how communication bandwidth and noise influence the emergence of useful signalling.

In [1]:
import os
import sys
from pathlib import Path

def change_to_repo_root(marker: str = "src") -> None:
    """Change CWD to the repository root (parent of `src`)."""
    here = Path.cwd()
    for parent in [here] + list(here.parents):
        if (parent / marker).is_dir():
            os.chdir(parent)
            break

change_to_repo_root()
print("CWD set to:", os.getcwd())

src_path = Path("src").resolve()
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
print("sys.path[0]:", sys.path[0])
CWD set to: c:\Users\nly99857\OneDrive - Philips\SW Projects\QSeaBattle
sys.path[0]: C:\Users\nly99857\OneDrive - Philips\SW Projects\QSeaBattle\src
In [2]:
import random
import math
import numpy as np
import pandas as pd
import tensorflow as tf

import Q_Sea_Battle as qsb
from Q_Sea_Battle.dru_utils import dru_train
from Q_Sea_Battle.reference_performance_utilities import limit_from_mutual_information

SEED = 1232
np.random.seed(SEED)
random.seed(SEED * 2)
tf.random.set_seed(SEED * 4)

tf.config.run_functions_eagerly(True)

print("NumPy random seed:", SEED)
print("Python random seed:", SEED * 2)
print("TF random seed:", SEED * 4)
print("TensorFlow version:", tf.__version__)
NumPy random seed: 1232
Python random seed: 2464
TF random seed: 4928
TensorFlow version: 2.20.0
In [3]:
FIELD_SIZES = [4, 8, 16, 32]
COMMS_SIZES = [1,2,4,8]

# RL hyperparameters (can be tuned)
NUM_EPOCHS = 128
BATCHES_PER_EPOCH = 40
BATCH_SIZE = 2048*16
SIGMA_TRAIN = 2.0
CLIP_RANGE = (-10.0, 10.0)
LEARNING_RATE = 1e-3

# Sigma annealing for DRU: high noise -> low noise over epochs
SIGMA_START = 2.0
SIGMA_END = 0.3

N_GAMES_TOURNAMENT = 10000
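For orientation (this small calculation is ours, not part of the original configuration cell), the settings above imply the following training volume per (field_size, comms_size) combination; the variable names below are illustrative:

updates_per_setting = NUM_EPOCHS * BATCHES_PER_EPOCH    # 128 * 40 = 5,120 gradient updates
samples_per_setting = updates_per_setting * BATCH_SIZE  # 5,120 * 32,768 = 167,772,160 sampled games
print(f"{updates_per_setting} updates and {samples_per_setting:,} sampled games per setting")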

DIAL + DRU Training Helpers¶

This section defines the helper functions used for training the neural agents via DIAL (Differentiable Inter‑Agent Learning).

DIAL allows gradients to flow through communication channels, enabling agents to learn what messages to send. However, communication must ultimately be discrete during evaluation. To bridge this, we use DRU (Discretise / Regularise Unit):

  • During training, DRU adds continuous noise, encouraging messages to become separable.
  • During evaluation, DRU reduces to a hard threshold, producing discrete bits.
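The training-mode behaviour is provided by the library's dru_train (imported above). As a rough, self-contained sketch of the two modes described here, the function names and the exact noise model below are our assumptions, not the library implementation:

import tensorflow as tf

def dru_train_sketch(logits: tf.Tensor, sigma: float, clip_range=(-10.0, 10.0)) -> tf.Tensor:
    """Training mode: clip the logits, add Gaussian channel noise, squash to (0, 1)."""
    clipped = tf.clip_by_value(logits, clip_range[0], clip_range[1])
    noisy = clipped + tf.random.normal(tf.shape(clipped), stddev=sigma)
    return tf.nn.sigmoid(noisy)

def dru_eval_sketch(logits: tf.Tensor) -> tf.Tensor:
    """Evaluation mode: hard threshold at zero, producing discrete bits in {0, 1}."""
    return tf.cast(logits > 0.0, tf.float32)

During training the noise forces the sender to push its logits far from zero so the message survives the noise, which is what makes the hard threshold at evaluation time nearly lossless.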

The helper functions here:

  • simulate batches of games,
  • run model A to generate communication logits,
  • pass DRU‑processed messages to model B,
  • compute loss from observed rewards,
  • update the parameters via policy gradients.

This mirrors the training structure of the original DIAL paper but adapted for the QSeaBattle game setup.
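In symbols, the update implemented in the next two cells is a standard REINFORCE estimator with a mean-reward baseline and an entropy bonus (our notation; β corresponds to entropy_coeff in the code):

$$
\nabla_\theta J \;\approx\; \frac{1}{B}\sum_{i=1}^{B}\Big[\big(r_i - \bar r\big)\,\nabla_\theta \log \pi_\theta(a_i \mid s_i) \;+\; \beta\,\nabla_\theta \mathcal{H}\big[\pi_\theta(\cdot \mid s_i)\big]\Big]
$$

The code minimizes the corresponding negative objective and optionally rescales the advantages r_i − r̄ by their standard deviation to reduce gradient variance; gradients reach model A through the continuous DRU output, which is the DIAL part of the scheme.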

In [4]:
from Q_Sea_Battle.tournament import Tournament
from Q_Sea_Battle.majority_players import MajorityPlayers
from Q_Sea_Battle.neural_net_players import NeuralNetPlayers


def sample_batch(layout: qsb.GameLayout, batch_size: int):
    """Sample (fields, guns, cell_values) for the given layout.

    - fields: shape (B, n²), Bernoulli(enemy_probability)
    - guns:   shape (B, n²), one-hot
    - cell_values: shape (B, 1), field value at the gun index
    """
    n2 = layout.field_size ** 2
    p = layout.enemy_probability

    fields = np.random.binomial(1, p, size=(batch_size, n2)).astype("float32")

    guns = np.zeros((batch_size, n2), dtype="float32")
    gun_indices = np.random.randint(0, n2, size=(batch_size,))
    guns[np.arange(batch_size), gun_indices] = 1.0

    cell_values = (fields * guns).sum(axis=1, keepdims=True).astype("float32")
    return fields, guns, cell_values


def evaluate_players_in_tournament(
    layout: qsb.GameLayout,
    players_factory,
    label: str = "",
) -> float:
    """Run a tournament and return mean reward.

    `players_factory` is any Players subclass instance, e.g.
    MajorityPlayers(layout) or NeuralNetPlayers(layout, ...).
    """
    game_env = qsb.GameEnv(game_layout=layout)
    tournament = Tournament(game_env=game_env, players=players_factory, game_layout=layout)
    log = tournament.tournament()
    mean_reward, std_err = log.outcome()

    if label:
        print(
            f"{label}: mean reward = {mean_reward:.4f} ± {std_err:.4f} "
        )
    else:
        print(
            f"mean reward = {mean_reward:.4f} ± {std_err:.4f} "
        )

    return mean_reward
In [5]:
def dial_pg_update(
    model_a: tf.keras.Model,
    model_b: tf.keras.Model,
    optimizer: tf.keras.optimizers.Optimizer,
    layout: qsb.GameLayout,
    batch_size: int = 512,
    sigma: float = 2.0,
    clip_range=(-10.0, 10.0),
    entropy_coeff: float = 0.01,
    normalize_adv: bool = True,
) -> tuple[float, float]:
    """Improved DIAL-style policy-gradient update step with DRU.

    Compared to a plain REINFORCE update, this adds:
    - optional advantage normalization (variance reduction),
    - an entropy bonus on the shoot policy.
    """
    fields_np, guns_np, cell_values_np = sample_batch(layout, batch_size)

    fields_tf = tf.convert_to_tensor(fields_np, dtype=tf.float32)
    fields_scaled = fields_tf - 0.5  # map {0, 1} -> {-0.5, 0.5} to match NeuralNetPlayerA
    guns_tf = tf.convert_to_tensor(guns_np, dtype=tf.float32)
    cell_values_tf = tf.convert_to_tensor(cell_values_np, dtype=tf.float32)

    n2 = layout.field_size ** 2
    denom = float(max(1, n2 - 1))
    gun_indices = tf.argmax(guns_tf, axis=1, output_type=tf.int32)
    gun_idx_norm = tf.cast(gun_indices, tf.float32) / denom
    gun_idx_norm = tf.reshape(gun_idx_norm, (-1, 1))


    eps = 1e-8

    with tf.GradientTape() as tape:
        # A produces communication logits
        comm_logits = model_a(fields_scaled, training=True)          # (B, m)

        # DRU (train mode): logits + noise -> logistic
        comm_cont = dru_train(comm_logits, sigma=sigma, clip_range=clip_range)
        comm_cont = tf.cast(comm_cont, tf.float32)               # (B, m) in (0,1)

        # B receives gun + continuous comm
        x_b = tf.concat([gun_idx_norm, comm_cont], axis=1)          # (B, 1 + m)
        shoot_logits = model_b(x_b, training=True)               # (B, 1)

        probs = tf.nn.sigmoid(shoot_logits)
        rnd = tf.random.uniform(tf.shape(probs))
        actions = tf.cast(rnd < probs, tf.float32)               # (B, 1) in {0,1}

        # Team reward: 1 if correct guess of the field bit at the gun index
        rewards = tf.cast(tf.equal(actions, cell_values_tf), tf.float32)

        # Baseline and advantage
        baseline = tf.reduce_mean(rewards)
        advantages = rewards - baseline

        if normalize_adv:
            adv_std = tf.math.reduce_std(advantages) + 1e-8
            advantages = advantages / adv_std

        advantages = tf.stop_gradient(advantages)

        # Log-prob of the sampled action under Bernoulli(probs)
        log_probs = (
            actions * tf.math.log(probs + eps)
            + (1.0 - actions) * tf.math.log(1.0 - probs + eps)
        )

        # Policy entropy for Bernoulli(probs)
        entropy = -(
            probs * tf.math.log(probs + eps)
            + (1.0 - probs) * tf.math.log(1.0 - probs + eps)
        )

        # REINFORCE loss with entropy regularization
        loss_pg = -tf.reduce_mean(log_probs * advantages)
        loss_ent = -tf.reduce_mean(entropy)   # negative mean entropy: minimizing it raises policy entropy
        loss = loss_pg + entropy_coeff * loss_ent

    params = model_a.trainable_variables + model_b.trainable_variables
    grads = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(grads, params))

    mean_reward = float(tf.reduce_mean(rewards).numpy())
    loss_value = float(loss.numpy())
    return mean_reward, loss_value
In [6]:
def sigma_for_epoch(epoch: int, num_epochs: int) -> float:
    """Linearly interpolate sigma from SIGMA_START to SIGMA_END over epochs."""
    t = epoch / max(1, num_epochs)
    return SIGMA_START * (1.0 - t) + SIGMA_END * t


print("\nRunning updated sweep with advantage normalization, entropy bonus, and sigma annealing...")

results_updated = []

for n in FIELD_SIZES:
    for m in COMMS_SIZES:
        print(f"Field_size = {n}, comms_size = {m}")

        layout = qsb.GameLayout(
            field_size=n,
            comms_size=m,
            enemy_probability=0.5,
            channel_noise=0.0,
            number_of_games_in_tournament=N_GAMES_TOURNAMENT,
        )

        # Majority players as before
        majority_players = MajorityPlayers(layout)
        maj_mean = evaluate_players_in_tournament(
            layout, majority_players,
            label="\tMajorityPlayers"
        )

        # Fresh NeuralNetPlayers and models
        nn_players = NeuralNetPlayers(game_layout=layout, explore=True)
        player_a, player_b = nn_players.players()
        model_a = nn_players.model_a
        model_b = nn_players.model_b
        assert model_a is not None and model_b is not None

        optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

        for epoch in range(1, NUM_EPOCHS + 1):
            sigma_now = sigma_for_epoch(epoch, NUM_EPOCHS)
            epoch_rewards = []
            epoch_losses = []

            for _ in range(BATCHES_PER_EPOCH):
                r, l = dial_pg_update(
                    model_a=model_a,
                    model_b=model_b,
                    optimizer=optimizer,
                    layout=layout,
                    batch_size=BATCH_SIZE,
                    sigma=sigma_now,
                    clip_range=CLIP_RANGE,
                    entropy_coeff=0.01,
                    normalize_adv=True,
                )
                epoch_rewards.append(r)
                epoch_losses.append(l)

        # Evaluate trained neural nets (greedy play)
        nn_players_eval = NeuralNetPlayers(
            game_layout=layout,
            model_a=model_a,
            model_b=model_b,
            explore=False,
        )
        nn_mean = evaluate_players_in_tournament(
            layout, nn_players_eval,
            label="\tNeuralNetPlayers (DIAL+DRU)"
        )

        info_limit = limit_from_mutual_information(
            field_size=n,
            comms_size=m,
            channel_noise=0.0,
            accuracy_in_digits=10,
        )

        print(
            f"\tInfo-theoretic upper bound (noiseless channel): {info_limit:.4f}"
        )

        results_updated.append(
            {
                "field_size": n,
                "comms_size": m,
                "maj_mean": maj_mean,
                "nn_dial_mean": nn_mean,
                "info_limit": info_limit,
            }
        )

        # Store trained models for this (field_size, comms_size) setting
        filename_a = f"notebooks/models/neural_net_model_a_f{n}_c{m}.keras"
        filename_b = f"notebooks/models/neural_net_model_b_f{n}_c{m}.keras"
        nn_players.store_models(filename_a, filename_b)
        print(f"Saved models to {filename_a} and {filename_b}")
Running updated sweep with advantage normalization, entropy bonus, and sigma annealing...
Field_size = 4, comms_size = 1
	MajorityPlayers: mean reward = 0.5988 ± 0.0049 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.6037 ± 0.0049 
	Info-theoretic upper bound (noiseless channel): 0.6461
Saved models to notebooks/models/neural_net_model_a_f4_c1.keras and notebooks/models/neural_net_model_b_f4_c1.keras
Field_size = 4, comms_size = 2
	MajorityPlayers: mean reward = 0.6442 ± 0.0048 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.6231 ± 0.0048 
	Info-theoretic upper bound (noiseless channel): 0.7051
Saved models to notebooks/models/neural_net_model_a_f4_c2.keras and notebooks/models/neural_net_model_b_f4_c2.keras
Field_size = 4, comms_size = 4
	MajorityPlayers: mean reward = 0.6823 ± 0.0047 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.6784 ± 0.0047 
	Info-theoretic upper bound (noiseless channel): 0.7855
Saved models to notebooks/models/neural_net_model_a_f4_c4.keras and notebooks/models/neural_net_model_b_f4_c4.keras
Field_size = 4, comms_size = 8
	MajorityPlayers: mean reward = 0.7422 ± 0.0044 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.6498 ± 0.0048 
	Info-theoretic upper bound (noiseless channel): 0.8900
Saved models to notebooks/models/neural_net_model_a_f4_c8.keras and notebooks/models/neural_net_model_b_f4_c8.keras
Field_size = 8, comms_size = 1
	MajorityPlayers: mean reward = 0.5541 ± 0.0050 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5540 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.5735
Saved models to notebooks/models/neural_net_model_a_f8_c1.keras and notebooks/models/neural_net_model_b_f8_c1.keras
Field_size = 8, comms_size = 2
	MajorityPlayers: mean reward = 0.5755 ± 0.0049 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5628 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.6037
Saved models to notebooks/models/neural_net_model_a_f8_c2.keras and notebooks/models/neural_net_model_b_f8_c2.keras
Field_size = 8, comms_size = 4
	MajorityPlayers: mean reward = 0.5955 ± 0.0049 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5801 ± 0.0049 
	Info-theoretic upper bound (noiseless channel): 0.6461
Saved models to notebooks/models/neural_net_model_a_f8_c4.keras and notebooks/models/neural_net_model_b_f8_c4.keras
Field_size = 8, comms_size = 8
	MajorityPlayers: mean reward = 0.6388 ± 0.0048 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5580 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.7051
Saved models to notebooks/models/neural_net_model_a_f8_c8.keras and notebooks/models/neural_net_model_b_f8_c8.keras
Field_size = 16, comms_size = 1
	MajorityPlayers: mean reward = 0.5287 ± 0.0050 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5238 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.5368
Saved models to notebooks/models/neural_net_model_a_f16_c1.keras and notebooks/models/neural_net_model_b_f16_c1.keras
Field_size = 16, comms_size = 2
	MajorityPlayers: mean reward = 0.5311 ± 0.0050 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5284 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.5520
Saved models to notebooks/models/neural_net_model_a_f16_c2.keras and notebooks/models/neural_net_model_b_f16_c2.keras
Field_size = 16, comms_size = 4
	MajorityPlayers: mean reward = 0.5596 ± 0.0050 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5337 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.5735
Saved models to notebooks/models/neural_net_model_a_f16_c4.keras and notebooks/models/neural_net_model_b_f16_c4.keras
Field_size = 16, comms_size = 8
	MajorityPlayers: mean reward = 0.5662 ± 0.0050 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5424 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.6037
Saved models to notebooks/models/neural_net_model_a_f16_c8.keras and notebooks/models/neural_net_model_b_f16_c8.keras
Field_size = 32, comms_size = 1
	MajorityPlayers: mean reward = 0.5129 ± 0.0050 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5182 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.5184
Saved models to notebooks/models/neural_net_model_a_f32_c1.keras and notebooks/models/neural_net_model_b_f32_c1.keras
Field_size = 32, comms_size = 2
	MajorityPlayers: mean reward = 0.5194 ± 0.0050 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5131 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.5260
Saved models to notebooks/models/neural_net_model_a_f32_c2.keras and notebooks/models/neural_net_model_b_f32_c2.keras
Field_size = 32, comms_size = 4
	MajorityPlayers: mean reward = 0.5161 ± 0.0050 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5137 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.5368
Saved models to notebooks/models/neural_net_model_a_f32_c4.keras and notebooks/models/neural_net_model_b_f32_c4.keras
Field_size = 32, comms_size = 8
	MajorityPlayers: mean reward = 0.5301 ± 0.0050 
	NeuralNetPlayers (DIAL+DRU): mean reward = 0.5253 ± 0.0050 
	Info-theoretic upper bound (noiseless channel): 0.5520
Saved models to notebooks/models/neural_net_model_a_f32_c8.keras and notebooks/models/neural_net_model_b_f32_c8.keras
In [7]:
df_results_updated = pd.DataFrame(results_updated)
df_results_updated
Out[7]:
field_size comms_size maj_mean nn_dial_mean info_limit
0 4 1 0.5988 0.6037 0.646103
1 4 2 0.6442 0.6231 0.705074
2 4 4 0.6823 0.6784 0.785498
3 4 8 0.7422 0.6498 0.889972
4 8 1 0.5541 0.5540 0.573455
5 8 2 0.5755 0.5628 0.603692
6 8 4 0.5955 0.5801 0.646103
7 8 8 0.6388 0.5580 0.705074
8 16 1 0.5287 0.5238 0.536777
9 16 2 0.5311 0.5284 0.551988
10 16 4 0.5596 0.5337 0.573455
11 16 8 0.5662 0.5424 0.603692
12 32 1 0.5129 0.5182 0.518395
13 32 2 0.5194 0.5131 0.526011
14 32 4 0.5161 0.5137 0.536777
15 32 8 0.5301 0.5253 0.551988

Conclusion¶

This notebook demonstrates how communication‑learning agents can be trained and compared against both analytical baselines and information‑theoretic limits.

With the updated model interface, only the input preprocessing changed; the rest of the pipeline is untouched, which keeps this tutorial stable for future experiments.

You can now adjust:

  • communication bandwidth,
  • field size,
  • DRU noise parameters,

to explore the emergence of communication under different constraints.
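As a starting point for such experiments, the stored models can be reloaded and re-evaluated without retraining. A minimal sketch, assuming the .keras files written above reload cleanly with tf.keras.models.load_model and reusing the names defined earlier in this notebook:

model_a = tf.keras.models.load_model("notebooks/models/neural_net_model_a_f4_c1.keras")
model_b = tf.keras.models.load_model("notebooks/models/neural_net_model_b_f4_c1.keras")

layout = qsb.GameLayout(
    field_size=4,
    comms_size=1,
    enemy_probability=0.5,
    channel_noise=0.0,
    number_of_games_in_tournament=N_GAMES_TOURNAMENT,
)
players = NeuralNetPlayers(game_layout=layout, model_a=model_a, model_b=model_b, explore=False)
evaluate_players_in_tournament(layout, players, label="Reloaded NeuralNetPlayers (f=4, c=1)")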

Results from an 840‑minute run¶

Settings¶

FIELD_SIZES = [4, 8, 16, 32]
COMMS_SIZES = [1, 2, 4, 8]

# RL hyperparameters (can be tuned)
NUM_EPOCHS = 128
BATCHES_PER_EPOCH = 40
BATCH_SIZE = 2048*16
SIGMA_TRAIN = 2.0
CLIP_RANGE = (-10.0, 10.0)
LEARNING_RATE = 1e-3

# Sigma annealing for DRU: high noise -> low noise over epochs
SIGMA_START = 2.0
SIGMA_END = 0.3

N_GAMES_TOURNAMENT = 10000

The ± range reported with each mean reward is the standard deviation divided by the square root of the number of games, i.e. the standard error over the 10,000 tournament games. The full sweep log and the resulting table from this run are identical to the cell In [6] output and the Out[7] table shown above.
