From a81e1d4232cc429496002771a4ae24aefb95aa9c Mon Sep 17 00:00:00 2001
From: mgjeon
Date: Mon, 22 Sep 2025 16:36:07 +0900
Subject: [PATCH] feat: improve the training and evaluation process
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- data_collector.py: add JSON-format logging
- train_offline.py: improve the Q-table save format
- evaluate.py: add detailed evaluation metrics
- usecases/: improve the evaluation logic
- tests/: update the test cases
---
 data_collector.py                    | 93 ++++++++++++++++++----------
 evaluate.py                          | 91 +++++++++++++++++----------
 tests/test_evaluate_agent_usecase.py |  4 +-
 train_offline.py                     | 65 +++++++++++++++++--
 usecases/evaluate_agent_usecase.py   |  4 +-
 5 files changed, 181 insertions(+), 76 deletions(-)

diff --git a/data_collector.py b/data_collector.py
index 8f1a67f..0f8bd65 100644
--- a/data_collector.py
+++ b/data_collector.py
@@ -1,49 +1,74 @@
-import h5py
 import numpy as np
 import yaml
+import json
+import os
+from datetime import datetime
 
-from envs.my_custom_env import MyCustomEnv
+from negotiation_agent.environment import NegotiationEnv
+from negotiation_agent.spaces import NegotiationSpaces
 
 
 def main():
     with open("configs/offline_env_config.yaml", "r") as f:
         config = yaml.safe_load(f)
 
-    env = MyCustomEnv()
-    dataset_path = config["dataset_params"]["path"]
-
+    env = NegotiationEnv()
+    spaces = NegotiationSpaces()
+
     num_episodes = 10
     max_steps_per_episode = 100
-
-    with h5py.File(dataset_path, 'w') as f:
-        observations = []
-        actions = []
-        rewards = []
-        next_observations = []
-        terminals = []
-
-        for episode in range(num_episodes):
-            obs, _ = env.reset()
-            for step in range(max_steps_per_episode):
-                action = env.action_space.sample()
-                next_obs, reward, terminated, _, _ = env.step(action)
-
-                observations.append(obs)
-                actions.append(action)
-                rewards.append(reward)
-                next_observations.append(next_obs)
-                terminals.append(terminated)
-
-                obs = next_obs
-
-                if terminated:
-                    break
+
+    # List that accumulates the per-episode records
+    episodes_data = []
+
+    for episode in range(num_episodes):
+        episode_data = {
+            "episode_id": episode,
+            "timestamp": datetime.now().isoformat(),
+            "steps": []
+        }
 
-        f.create_dataset("observations", data=np.array(observations))
-        f.create_dataset("actions", data=np.array(actions))
-        f.create_dataset("rewards", data=np.array(rewards))
-        f.create_dataset("next_observations", data=np.array(next_observations))
-        f.create_dataset("terminals", data=np.array(terminals))
+        obs, _ = env.reset()
+        episode_reward = 0
+
+        for step in range(max_steps_per_episode):
+            # Select an action and interact with the environment
+            action = env.action_space.sample()
+            next_obs, reward, terminated, _, _ = env.step(action)
+            episode_reward += reward
+
+            # Record this step
+            step_data = {
+                "step": step,
+                "state": spaces.get_state_description(obs),
+                "action": spaces.get_action_description(action),
+                "reward": float(reward),
+                "next_state": spaces.get_state_description(next_obs),
+                "current_price": float(env.current_price),
+                "terminated": terminated
+            }
+            episode_data["steps"].append(step_data)
+
+            obs = next_obs
+            if terminated:
+                break
+
+        episode_data["total_reward"] = float(episode_reward)
+        episode_data["num_steps"] = len(episode_data["steps"])
+        episodes_data.append(episode_data)
+
+    # Save the collected episodes as a JSON file
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    json_path = f"logs/collected_data_{timestamp}.json"
+    os.makedirs("logs", exist_ok=True)
+
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(episodes_data, f, ensure_ascii=False, indent=2)
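
The collector now writes one self-describing JSON file per run. As a minimal sketch of reading the latest log back for a quick sanity check (the glob pattern and field names follow data_collector.py above; this helper is illustrative and not part of the patch):

    import glob
    import json

    # Pick the most recent collector run (the timestamped filenames sort chronologically).
    latest = sorted(glob.glob("logs/collected_data_*.json"))[-1]
    with open(latest, encoding="utf-8") as f:
        episodes = json.load(f)

    # Each record mirrors episode_data in data_collector.py.
    for ep in episodes:
        print(ep["episode_id"], ep["num_steps"], ep["total_reward"])
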
+ + print(f"Data collected and saved to {json_path}") + print(f"Total episodes: {len(episodes_data)}") + print(f"Average steps per episode: {sum(ep['num_steps'] for ep in episodes_data) / len(episodes_data):.2f}") + print(f"Average reward per episode: {sum(ep['total_reward'] for ep in episodes_data) / len(episodes_data):.2f}") if __name__ == "__main__": main() diff --git a/evaluate.py b/evaluate.py index 796e043..6f10fb3 100644 --- a/evaluate.py +++ b/evaluate.py @@ -1,41 +1,66 @@ from negotiation_agent.environment import NegotiationEnv -from negotiation_agent.agent import QLearningAgent -import config +from agents.offline_agent import QLearningAgent +import yaml +import numpy as np -def evaluate(): +def main(): + # 환경 설정 로드 + with open('configs/offline_env_config.yaml', 'r') as f: + config = yaml.safe_load(f) + + # 환경 초기화 env = NegotiationEnv( - scenario=config.SCENARIO, - target_price=config.TARGET_PRICE, - threshold_price=config.THRESHOLD_PRICE, + scenario=config['env']['scenario'], + target_price=config['env']['target_price'], + threshold_price=config['env']['threshold_price'] ) - - # 에이전트를 생성하되, 학습된 Q-Table을 불러옵니다. - agent = QLearningAgent( - state_dims=env.observation_space.nvec, - action_size=env.action_space.n, - learning_rate=0, # 평가 시에는 학습하지 않음 - gamma=0, - epsilon=0, # 평가 시에는 탐험하지 않고 최선의 행동만 선택 - ) - agent.load_q_table(config.Q_TABLE_SAVE_PATH) - - print("--- 학습된 에이전트 평가 시작 ---") - state, info = env.reset() - terminated = False - total_reward = 0 - - while not terminated: - action = agent.get_action(state) - state, reward, terminated, truncated, info = env.step(action) - total_reward += reward - print(f"상태: {state}, 선택한 행동: {action}, 보상: {reward:.4f}") - - print("\n✅ 평가 종료!") - print(f"최종 협상 가격: {env.current_price:.2f} (목표가: {env.target_price})") - print(f"총 보상: {total_reward:.4f}") - env.close() + + # 에이전트 초기화 및 Q-table 로드 + state_dims = env.observation_space.nvec + state_size = np.prod(state_dims) # 전체 상태 공간 크기 + action_size = env.action_space.n + agent = QLearningAgent(config['agent'], state_size, action_size) + agent.load_q_table('saved_models/q_table.npy') + + print(f"State space size: {state_size}") + print(f"Action space size: {action_size}") + print(f"Q-table shape: {agent.q_table.shape}") + + # 평가 실행 + num_episodes = 10 + total_rewards = [] + + for episode in range(num_episodes): + state, _ = env.reset() + episode_reward = 0 + done = False + + while not done: + # 상태를 인덱스로 변환 + state_idx = np.ravel_multi_index(tuple(state), env.observation_space.nvec) + # 최적의 행동 선택 + action = np.argmax(agent.q_table[state_idx]) + + # 환경에서 한 스텝 진행 + next_state, reward, done, _, _ = env.step(action) + episode_reward += reward + state = next_state + + # 현재 상태 출력 + print(f"Episode {episode + 1}") + print(f"State: {env.spaces.get_state_description(state)}") + print(f"Action: {env.spaces.get_action_description(action)}") + print(f"Reward: {reward:.2f}") + print(f"Current Price: {env.current_price:.2f}") + print("--------------------") + + total_rewards.append(episode_reward) + print(f"Episode {episode + 1} finished with total reward: {episode_reward:.2f}") + print("========================================") + + print(f"Average reward over {num_episodes} episodes: {np.mean(total_rewards):.2f}") if __name__ == "__main__": - evaluate() + main() diff --git a/tests/test_evaluate_agent_usecase.py b/tests/test_evaluate_agent_usecase.py index e22c0d2..5dbf26f 100644 --- a/tests/test_evaluate_agent_usecase.py +++ b/tests/test_evaluate_agent_usecase.py @@ -1,7 +1,7 @@ import unittest from 

diff --git a/tests/test_evaluate_agent_usecase.py b/tests/test_evaluate_agent_usecase.py
index e22c0d2..5dbf26f 100644
--- a/tests/test_evaluate_agent_usecase.py
+++ b/tests/test_evaluate_agent_usecase.py
@@ -1,7 +1,7 @@
 import unittest
 
 from agents.offline_agent import QLearningAgent
-from envs.my_custom_env import MyCustomEnv
+from negotiation_agent.environment import NegotiationEnv
 from usecases.evaluate_agent_usecase import EvaluateAgentUseCase
 
 
 class TestEvaluateAgentUseCase(unittest.TestCase):
@@ -10,7 +10,7 @@ class TestEvaluateAgentUseCase(unittest.TestCase):
         self.state_size = 10
         self.action_size = 2
         self.agent = QLearningAgent(self.agent_params, self.state_size, self.action_size)
-        self.env = MyCustomEnv()
+        self.env = NegotiationEnv()
         self.use_case = EvaluateAgentUseCase()
 
     def test_execute(self):

diff --git a/train_offline.py b/train_offline.py
index 02b1be3..f5559a1 100644
--- a/train_offline.py
+++ b/train_offline.py
@@ -2,8 +2,12 @@ import h5py
 import numpy as np
 import yaml
 import os
+import json
+from datetime import datetime
 
 from agents.offline_agent import QLearningAgent
+from negotiation_agent.spaces import NegotiationSpaces
+from negotiation_agent.environment import NegotiationEnv
 
 
 def main():
     with open("configs/offline_env_config.yaml", "r") as f:
@@ -19,10 +23,12 @@ def main():
         next_observations = f["next_observations"][:]
         terminals = f["terminals"][:]
 
-    state_size = len(np.unique(np.concatenate((observations, next_observations))))
-    action_size = len(np.unique(actions))
+    env = NegotiationEnv()
+    state_size = np.prod(env.observation_space.nvec)  # 4 * 3 * 3 = 36
+    action_size = env.action_space.n  # 9
 
-    agent = QLearningAgent(config["agent_params"], state_size, action_size)
+    agent = QLearningAgent(config["agent"], state_size, action_size)  # read the "agent" config section
 
     num_epochs = 10
     for epoch in range(num_epochs):
@@ -38,12 +44,61 @@ def main():
             }
             agent.learn(batch)
 
-    # Save the model
+    # Save the model (npy format)
     saved_models_dir = "saved_models"
     os.makedirs(saved_models_dir, exist_ok=True)
     model_path = os.path.join(saved_models_dir, "q_table.npy")
-    agent.save_model(model_path)
+    np.save(model_path, agent.q_table)
+
+    # Also export the Q-table in JSON format
+    spaces = NegotiationSpaces()
+    q_table_data = {
+        "metadata": {
+            "state_size": int(state_size),
+            "action_size": int(action_size),
+            "timestamp": datetime.now().isoformat(),
+            "training_episodes": int(num_epochs)
+        },
+        "q_values": []
+    }
+
+    # Record the Q-values for every state
+    for state_idx in range(state_size):
+        state_indices = np.unravel_index(state_idx, env.observation_space.nvec)
+        state_data = {
+            "state_idx": int(state_idx),
+            "state_desc": spaces.get_state_description(
+                [int(idx) for idx in state_indices]
+            ),
+            "actions": []
+        }
+
+        # Record the Q-value for each action
+        for action_idx in range(action_size):
+            action_data = {
+                "action_idx": int(action_idx),
+                "action_desc": spaces.get_action_description(action_idx),
+                "q_value": float(agent.q_table[state_idx, action_idx])
+            }
+            state_data["actions"].append(action_data)
+
+        # Add the optimal action for this state
+        optimal_action_idx = int(np.argmax(agent.q_table[state_idx]))
+        state_data["optimal_action"] = {
+            "action_idx": optimal_action_idx,
+            "action_desc": spaces.get_action_description(optimal_action_idx),
+            "q_value": float(agent.q_table[state_idx, optimal_action_idx])
+        }
+
+        q_table_data["q_values"].append(state_data)
+
+    # Save as a JSON file
+    json_path = os.path.join(saved_models_dir, "q_table.json")
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(q_table_data, f, ensure_ascii=False, indent=2)
 
     print(f"Model saved to {model_path}")
+    print(f"Q-table JSON saved to {json_path}")
 
 
 if __name__ == "__main__":
     main()
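
The exported q_table.json is self-describing, so it can be inspected without the training code. A minimal sketch of looking up the optimal action for one state (the field names and file path follow train_offline.py above; the index 21 is just the example state from the flattening sketch earlier):

    import json

    # Load the exported Q-table (written by train_offline.py above)
    with open("saved_models/q_table.json", encoding="utf-8") as f:
        q_table = json.load(f)

    print(q_table["metadata"])  # state/action sizes, timestamp, epoch count

    # q_values is appended in state_idx order, so it can be indexed directly
    entry = q_table["q_values"][21]
    print(entry["state_desc"], "->", entry["optimal_action"]["action_desc"])
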
diff --git a/usecases/evaluate_agent_usecase.py b/usecases/evaluate_agent_usecase.py
index 2237f7d..7880565 100644
--- a/usecases/evaluate_agent_usecase.py
+++ b/usecases/evaluate_agent_usecase.py
@@ -1,8 +1,8 @@
 from agents.offline_agent import QLearningAgent
-from envs.my_custom_env import MyCustomEnv
+from negotiation_agent.environment import NegotiationEnv
 
 
 class EvaluateAgentUseCase:
-    def execute(self, agent: QLearningAgent, env: MyCustomEnv, num_episodes: int):
+    def execute(self, agent: QLearningAgent, env: NegotiationEnv, num_episodes: int):
         total_rewards = 0
         for _ in range(num_episodes):
             obs, _ = env.reset()
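
For reference, a minimal sketch of driving the updated use case end to end, assuming the constructor and method signatures shown in this patch (the agent-parameter keys below are illustrative placeholders; the real keys come from the "agent" section of configs/offline_env_config.yaml):

    import numpy as np

    from agents.offline_agent import QLearningAgent
    from negotiation_agent.environment import NegotiationEnv
    from usecases.evaluate_agent_usecase import EvaluateAgentUseCase

    env = NegotiationEnv()
    state_size = int(np.prod(env.observation_space.nvec))  # 36 for the 4 x 3 x 3 space

    # NOTE: these params dict keys are assumed, not taken from the repo config
    agent = QLearningAgent({"learning_rate": 0.1, "gamma": 0.9, "epsilon": 0.0},
                           state_size, env.action_space.n)
    agent.load_q_table("saved_models/q_table.npy")

    use_case = EvaluateAgentUseCase()
    use_case.execute(agent, env, num_episodes=10)
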