feat: 학습 및 평가 프로세스 개선

- data_collector.py: JSON 형식 로깅 추가
- train_offline.py: Q-table 저장 형식 개선
- evaluate.py: 평가 지표 상세화
- usecases/: 평가 로직 개선
- tests/: 테스트 케이스 업데이트
main
mgjeon 2025-09-22 16:36:07 +09:00
parent e85490e0ab
commit a81e1d4232
5 changed files with 181 additions and 76 deletions

View File

@ -1,49 +1,74 @@
import h5py
import numpy as np import numpy as np
import yaml import yaml
import json
import os
from datetime import datetime
from envs.my_custom_env import MyCustomEnv from negotiation_agent.environment import NegotiationEnv
from negotiation_agent.spaces import NegotiationSpaces
def main(): def main():
with open("configs/offline_env_config.yaml", "r") as f: with open("configs/offline_env_config.yaml", "r") as f:
config = yaml.safe_load(f) config = yaml.safe_load(f)
env = MyCustomEnv() env = NegotiationEnv()
dataset_path = config["dataset_params"]["path"] spaces = NegotiationSpaces()
num_episodes = 10 num_episodes = 10
max_steps_per_episode = 100 max_steps_per_episode = 100
with h5py.File(dataset_path, 'w') as f: # 데이터를 저장할 리스트
observations = [] episodes_data = []
actions = []
rewards = []
next_observations = []
terminals = []
for episode in range(num_episodes): for episode in range(num_episodes):
obs, _ = env.reset() episode_data = {
for step in range(max_steps_per_episode): "episode_id": episode,
action = env.action_space.sample() "timestamp": datetime.now().isoformat(),
next_obs, reward, terminated, _, _ = env.step(action) "steps": []
}
observations.append(obs) obs, _ = env.reset()
actions.append(action) episode_reward = 0
rewards.append(reward)
next_observations.append(next_obs)
terminals.append(terminated)
obs = next_obs for step in range(max_steps_per_episode):
# 행동 선택 및 환경과 상호작용
action = env.action_space.sample()
next_obs, reward, terminated, _, _ = env.step(action)
episode_reward += reward
if terminated: # 스텝 데이터 저장
break step_data = {
"step": step,
"state": spaces.get_state_description(obs),
"action": spaces.get_action_description(action),
"reward": float(reward),
"next_state": spaces.get_state_description(next_obs),
"current_price": float(env.current_price),
"terminated": terminated
}
episode_data["steps"].append(step_data)
f.create_dataset("observations", data=np.array(observations)) obs = next_obs
f.create_dataset("actions", data=np.array(actions)) if terminated:
f.create_dataset("rewards", data=np.array(rewards)) break
f.create_dataset("next_observations", data=np.array(next_observations))
f.create_dataset("terminals", data=np.array(terminals)) episode_data["total_reward"] = float(episode_reward)
episode_data["num_steps"] = len(episode_data["steps"])
episodes_data.append(episode_data)
# JSON 파일로 저장
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
json_path = f"logs/collected_data_{timestamp}.json"
os.makedirs("logs", exist_ok=True)
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(episodes_data, f, ensure_ascii=False, indent=2)
print(f"Data collected and saved to {json_path}")
print(f"Total episodes: {len(episodes_data)}")
print(f"Average steps per episode: {sum(ep['num_steps'] for ep in episodes_data) / len(episodes_data):.2f}")
print(f"Average reward per episode: {sum(ep['total_reward'] for ep in episodes_data) / len(episodes_data):.2f}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -1,41 +1,66 @@
from negotiation_agent.environment import NegotiationEnv from negotiation_agent.environment import NegotiationEnv
from negotiation_agent.agent import QLearningAgent from agents.offline_agent import QLearningAgent
import config import yaml
import numpy as np
def evaluate(): def main():
# 환경 설정 로드
with open('configs/offline_env_config.yaml', 'r') as f:
config = yaml.safe_load(f)
# 환경 초기화
env = NegotiationEnv( env = NegotiationEnv(
scenario=config.SCENARIO, scenario=config['env']['scenario'],
target_price=config.TARGET_PRICE, target_price=config['env']['target_price'],
threshold_price=config.THRESHOLD_PRICE, threshold_price=config['env']['threshold_price']
) )
# 에이전트를 생성하되, 학습된 Q-Table을 불러옵니다. # 에이전트 초기화 및 Q-table 로드
agent = QLearningAgent( state_dims = env.observation_space.nvec
state_dims=env.observation_space.nvec, state_size = np.prod(state_dims) # 전체 상태 공간 크기
action_size=env.action_space.n, action_size = env.action_space.n
learning_rate=0, # 평가 시에는 학습하지 않음 agent = QLearningAgent(config['agent'], state_size, action_size)
gamma=0, agent.load_q_table('saved_models/q_table.npy')
epsilon=0, # 평가 시에는 탐험하지 않고 최선의 행동만 선택
)
agent.load_q_table(config.Q_TABLE_SAVE_PATH)
print("--- 학습된 에이전트 평가 시작 ---") print(f"State space size: {state_size}")
state, info = env.reset() print(f"Action space size: {action_size}")
terminated = False print(f"Q-table shape: {agent.q_table.shape}")
total_reward = 0
while not terminated: # 평가 실행
action = agent.get_action(state) num_episodes = 10
state, reward, terminated, truncated, info = env.step(action) total_rewards = []
total_reward += reward
print(f"상태: {state}, 선택한 행동: {action}, 보상: {reward:.4f}")
print("\n✅ 평가 종료!") for episode in range(num_episodes):
print(f"최종 협상 가격: {env.current_price:.2f} (목표가: {env.target_price})") state, _ = env.reset()
print(f"총 보상: {total_reward:.4f}") episode_reward = 0
env.close() done = False
while not done:
# 상태를 인덱스로 변환
state_idx = np.ravel_multi_index(tuple(state), env.observation_space.nvec)
# 최적의 행동 선택
action = np.argmax(agent.q_table[state_idx])
# 환경에서 한 스텝 진행
next_state, reward, done, _, _ = env.step(action)
episode_reward += reward
state = next_state
# 현재 상태 출력
print(f"Episode {episode + 1}")
print(f"State: {env.spaces.get_state_description(state)}")
print(f"Action: {env.spaces.get_action_description(action)}")
print(f"Reward: {reward:.2f}")
print(f"Current Price: {env.current_price:.2f}")
print("--------------------")
total_rewards.append(episode_reward)
print(f"Episode {episode + 1} finished with total reward: {episode_reward:.2f}")
print("========================================")
print(f"Average reward over {num_episodes} episodes: {np.mean(total_rewards):.2f}")
if __name__ == "__main__": if __name__ == "__main__":
evaluate() main()

View File

@ -1,7 +1,7 @@
import unittest import unittest
from agents.offline_agent import QLearningAgent from agents.offline_agent import QLearningAgent
from envs.my_custom_env import MyCustomEnv from negotiation_agent.environment import NegotiationEnv
from usecases.evaluate_agent_usecase import EvaluateAgentUseCase from usecases.evaluate_agent_usecase import EvaluateAgentUseCase
class TestEvaluateAgentUseCase(unittest.TestCase): class TestEvaluateAgentUseCase(unittest.TestCase):
@ -10,7 +10,7 @@ class TestEvaluateAgentUseCase(unittest.TestCase):
self.state_size = 10 self.state_size = 10
self.action_size = 2 self.action_size = 2
self.agent = QLearningAgent(self.agent_params, self.state_size, self.action_size) self.agent = QLearningAgent(self.agent_params, self.state_size, self.action_size)
self.env = MyCustomEnv() self.env = NegotiationEnv()
self.use_case = EvaluateAgentUseCase() self.use_case = EvaluateAgentUseCase()
def test_execute(self): def test_execute(self):

View File

@ -2,8 +2,12 @@ import h5py
import numpy as np import numpy as np
import yaml import yaml
import os import os
import json
from datetime import datetime
from agents.offline_agent import QLearningAgent from agents.offline_agent import QLearningAgent
from negotiation_agent.spaces import NegotiationSpaces
from negotiation_agent.environment import NegotiationEnv
def main(): def main():
with open("configs/offline_env_config.yaml", "r") as f: with open("configs/offline_env_config.yaml", "r") as f:
@ -19,10 +23,12 @@ def main():
next_observations = f["next_observations"][:] next_observations = f["next_observations"][:]
terminals = f["terminals"][:] terminals = f["terminals"][:]
state_size = len(np.unique(np.concatenate((observations, next_observations)))) from negotiation_agent.environment import NegotiationEnv
action_size = len(np.unique(actions)) env = NegotiationEnv()
state_size = np.prod(env.observation_space.nvec) # 4 * 3 * 3 = 36
action_size = env.action_space.n # 9
agent = QLearningAgent(config["agent_params"], state_size, action_size) agent = QLearningAgent(config["agent"], state_size, action_size) # config["agent"]로 수정
num_epochs = 10 num_epochs = 10
for epoch in range(num_epochs): for epoch in range(num_epochs):
@ -38,12 +44,61 @@ def main():
} }
agent.learn(batch) agent.learn(batch)
# Save the model # 모델 저장 (npy 형식)
saved_models_dir = "saved_models" saved_models_dir = "saved_models"
os.makedirs(saved_models_dir, exist_ok=True) os.makedirs(saved_models_dir, exist_ok=True)
model_path = os.path.join(saved_models_dir, "q_table.npy") model_path = os.path.join(saved_models_dir, "q_table.npy")
agent.save_model(model_path) np.save(model_path, agent.q_table)
# Q-table을 JSON 형식으로도 저장
spaces = NegotiationSpaces()
q_table_data = {
"metadata": {
"state_size": int(state_size),
"action_size": int(action_size),
"timestamp": datetime.now().isoformat(),
"training_episodes": int(num_epochs)
},
"q_values": []
}
# 각 상태에 대한 Q-값을 저장
for state_idx in range(state_size):
state_indices = np.unravel_index(state_idx, env.observation_space.nvec)
state_data = {
"state_idx": int(state_idx),
"state_desc": spaces.get_state_description(
[int(idx) for idx in state_indices]
),
"actions": []
}
# 각 행동에 대한 Q-값을 저장
for action_idx in range(action_size):
action_data = {
"action_idx": int(action_idx),
"action_desc": spaces.get_action_description(action_idx),
"q_value": float(agent.q_table[state_idx, action_idx])
}
state_data["actions"].append(action_data)
# 최적 행동 정보 추가
optimal_action_idx = int(np.argmax(agent.q_table[state_idx]))
state_data["optimal_action"] = {
"action_idx": optimal_action_idx,
"action_desc": spaces.get_action_description(optimal_action_idx),
"q_value": float(agent.q_table[state_idx, optimal_action_idx])
}
q_table_data["q_values"].append(state_data)
# JSON 파일로 저장
json_path = os.path.join(saved_models_dir, "q_table.json")
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(q_table_data, f, ensure_ascii=False, indent=2)
print(f"Model saved to {model_path}") print(f"Model saved to {model_path}")
print(f"Q-table JSON saved to {json_path}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -1,8 +1,8 @@
from agents.offline_agent import QLearningAgent from agents.offline_agent import QLearningAgent
from envs.my_custom_env import MyCustomEnv from negotiation_agent.environment import NegotiationEnv
class EvaluateAgentUseCase: class EvaluateAgentUseCase:
def execute(self, agent: QLearningAgent, env: MyCustomEnv, num_episodes: int): def execute(self, agent: QLearningAgent, env: NegotiationEnv, num_episodes: int):
total_rewards = 0 total_rewards = 0
for _ in range(num_episodes): for _ in range(num_episodes):
obs, _ = env.reset() obs, _ = env.reset()