import pytest
import numpy as np
from agents.offline_agent import QLearningAgent

@pytest.fixture
def agent_params():
    """Provide the hyperparameter dict used to construct the agent under test."""
    hyperparams = {
        'learning_rate': 0.1,
        'discount_factor': 0.99,
        # Zero epsilon: intended to keep action selection deterministic in tests.
        'epsilon': 0.0,
    }
    return hyperparams

@pytest.fixture
def agent(agent_params):
    """Build a fresh QLearningAgent with 4 states and 3 actions for each test."""
    q_agent = QLearningAgent(agent_params, state_size=4, action_size=3)
    return q_agent

def test_agent_initialization(agent):
    """Check the attributes a freshly constructed agent exposes."""
    # Dimensions passed to the constructor by the `agent` fixture.
    assert agent.state_size == 4
    assert agent.action_size == 3

    # Hyperparameters supplied through the `agent_params` fixture.
    assert agent.lr == 0.1
    assert agent.gamma == 0.99

    # One row per state, one column per action, every entry starting at zero.
    table = agent.q_table
    assert table.shape == (4, 3)
    assert np.all(table == 0)

def test_get_action_with_mask(agent):
    """Check action selection for state 0 both without and with an action mask."""
    # Seed state 0 with strictly increasing Q-values so the ranking is unambiguous.
    state_q_values = np.array([1.0, 2.0, 3.0])
    agent.q_table[0] = state_q_values

    # Unmasked call: the action with the largest Q-value (index 2) is expected.
    unmasked_choice = agent.get_action(0)
    assert unmasked_choice == 2

    # Masked call: a 0 entry disables action 2, so index 1 is expected next.
    allowed = np.array([1, 1, 0])
    masked_choice = agent.get_action(0, allowed)
    assert masked_choice == 1

def test_episode_tracking(agent):
    """Check that repeated calls in one episode never repeat an action.

    The test requests as many actions as exist for state 0, asserts each one
    is non-None and that they are all distinct, then asserts that one more
    request yields None.
    """
    agent.q_table[0] = np.array([1.0, 2.0, 3.0])

    # Request one action per available slot, collecting the distinct picks.
    distinct_picks = set()
    for _ in range(agent.action_size):
        picked = agent.get_action(0)
        assert picked is not None
        distinct_picks.add(picked)

    # Every action must have been returned exactly once.
    assert len(distinct_picks) == agent.action_size

    # With nothing left to pick, the agent is expected to return None.
    exhausted = agent.get_action(0)
    assert exhausted is None

def test_episode_reset(agent):
    """Check that reset_episode() makes the top-valued action selectable again."""
    agent.q_table[0] = np.array([1.0, 2.0, 3.0])

    # Consume two actions for state 0 before resetting.
    for _ in range(2):
        agent.get_action(0)

    agent.reset_episode()

    # After the reset, the highest Q-value action (index 2) is expected again.
    assert agent.get_action(0) == 2

def test_learning(agent):
    """Check a single Q-learning update against the textbook formula.

    Expected rule: Q(s,a) <- Q(s,a) + lr * (R + gamma * max_a' Q(s',a') - Q(s,a))
    """
    # Next-state (state 1) Q-values; their maximum is 0.8.
    agent.q_table[1] = np.array([0.5, 0.8, 0.3])

    # Value of the (state 0, action 1) entry before the update.
    before = agent.q_table[0, 1]

    # One non-terminal transition: state 0 --action 1, reward 1.0--> state 1.
    transition = {
        'observations': np.array([0]),
        'actions': np.array([1]),
        'rewards': np.array([1.0]),
        'next_observations': np.array([1]),
        'terminals': np.array([False]),
    }
    agent.learn(transition)

    # Recompute the update by hand and compare with what the agent stored.
    td_error = 1.0 + agent.gamma * 0.8 - before
    expected_value = before + agent.lr * td_error
    assert np.isclose(agent.q_table[0, 1], expected_value)

def test_save_and_load(agent, agent_params, tmp_path):
    """Check that a saved Q-table is restored unchanged by a second agent.

    `agent_params` is requested as a fixture argument: pytest raises a
    "Fixture called directly" error when a fixture function is invoked as
    `agent_params()`.
    """
    # Give the table a non-zero row so a failed load cannot pass by accident.
    agent.q_table[0] = np.array([1.0, 2.0, 3.0])

    # Save model
    save_path = tmp_path / "q_table.npy"
    agent.save_model(save_path)

    # Create a second agent with the same configuration and load the table.
    # The params dict is copied so the two agents do not share one object.
    new_agent = QLearningAgent(dict(agent_params), state_size=4, action_size=3)
    new_agent.load_q_table(save_path)

    # array_equal also fails on a shape mismatch instead of broadcasting.
    assert np.array_equal(agent.q_table, new_agent.q_table)