feat: Improve the training and evaluation process
- data_collector.py: add JSON-format logging
- train_offline.py: improve the Q-table save format
- evaluate.py: report more detailed evaluation metrics
- usecases/: improve the evaluation logic
- tests/: update the test cases
parent e85490e0ab
commit a81e1d4232
data_collector.py
@@ -1,49 +1,74 @@
-import h5py
 import numpy as np
 import yaml
+import json
+import os
+from datetime import datetime
 
-from envs.my_custom_env import MyCustomEnv
+from negotiation_agent.environment import NegotiationEnv
+from negotiation_agent.spaces import NegotiationSpaces
 
 
 def main():
     with open("configs/offline_env_config.yaml", "r") as f:
         config = yaml.safe_load(f)
 
-    env = MyCustomEnv()
-    dataset_path = config["dataset_params"]["path"]
+    env = NegotiationEnv()
+    spaces = NegotiationSpaces()
 
     num_episodes = 10
     max_steps_per_episode = 100
 
-    with h5py.File(dataset_path, 'w') as f:
-        observations = []
-        actions = []
-        rewards = []
-        next_observations = []
-        terminals = []
-
-        for episode in range(num_episodes):
-            obs, _ = env.reset()
-            for step in range(max_steps_per_episode):
-                action = env.action_space.sample()
-                next_obs, reward, terminated, _, _ = env.step(action)
-
-                observations.append(obs)
-                actions.append(action)
-                rewards.append(reward)
-                next_observations.append(next_obs)
-                terminals.append(terminated)
-
-                obs = next_obs
-
-                if terminated:
-                    break
-
-        f.create_dataset("observations", data=np.array(observations))
-        f.create_dataset("actions", data=np.array(actions))
-        f.create_dataset("rewards", data=np.array(rewards))
-        f.create_dataset("next_observations", data=np.array(next_observations))
-        f.create_dataset("terminals", data=np.array(terminals))
+    # List that will hold the collected episode data
+    episodes_data = []
+
+    for episode in range(num_episodes):
+        episode_data = {
+            "episode_id": episode,
+            "timestamp": datetime.now().isoformat(),
+            "steps": []
+        }
+
+        obs, _ = env.reset()
+        episode_reward = 0
+
+        for step in range(max_steps_per_episode):
+            # Select an action and interact with the environment
+            action = env.action_space.sample()
+            next_obs, reward, terminated, _, _ = env.step(action)
+            episode_reward += reward
+
+            # Record this step
+            step_data = {
+                "step": step,
+                "state": spaces.get_state_description(obs),
+                "action": spaces.get_action_description(action),
+                "reward": float(reward),
+                "next_state": spaces.get_state_description(next_obs),
+                "current_price": float(env.current_price),
+                "terminated": terminated
+            }
+            episode_data["steps"].append(step_data)
+
+            obs = next_obs
+            if terminated:
+                break
+
+        episode_data["total_reward"] = float(episode_reward)
+        episode_data["num_steps"] = len(episode_data["steps"])
+        episodes_data.append(episode_data)
+
+    # Save the collected episodes as a JSON file
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    json_path = f"logs/collected_data_{timestamp}.json"
+    os.makedirs("logs", exist_ok=True)
+
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(episodes_data, f, ensure_ascii=False, indent=2)
+
+    print(f"Data collected and saved to {json_path}")
+    print(f"Total episodes: {len(episodes_data)}")
+    print(f"Average steps per episode: {sum(ep['num_steps'] for ep in episodes_data) / len(episodes_data):.2f}")
+    print(f"Average reward per episode: {sum(ep['total_reward'] for ep in episodes_data) / len(episodes_data):.2f}")
 
 
 if __name__ == "__main__":
     main()
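The resulting log is a plain JSON array of episode records, so it can be inspected without any RL tooling. A minimal sketch of reading one back (the path follows the naming scheme in the diff above; the timestamp is a placeholder):

import json

# Placeholder path; substitute the timestamp of an actual run
with open("logs/collected_data_20240101_000000.json", encoding="utf-8") as f:
    episodes = json.load(f)

# Each record carries the fields written above: episode_id, timestamp,
# steps (a list of per-step dicts), total_reward, and num_steps.
for ep in episodes:
    print(ep["episode_id"], ep["num_steps"], ep["total_reward"])

# Example: all rewards observed in the first episode
rewards = [s["reward"] for s in episodes[0]["steps"]]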
evaluate.py
@@ -1,41 +1,66 @@
 from negotiation_agent.environment import NegotiationEnv
-from negotiation_agent.agent import QLearningAgent
-import config
+from agents.offline_agent import QLearningAgent
+import yaml
+import numpy as np
 
 
-def evaluate():
+def main():
+    # Load the environment configuration
+    with open('configs/offline_env_config.yaml', 'r') as f:
+        config = yaml.safe_load(f)
+
+    # Initialize the environment
     env = NegotiationEnv(
-        scenario=config.SCENARIO,
-        target_price=config.TARGET_PRICE,
-        threshold_price=config.THRESHOLD_PRICE,
+        scenario=config['env']['scenario'],
+        target_price=config['env']['target_price'],
+        threshold_price=config['env']['threshold_price']
     )
 
-    # Create the agent, but load the trained Q-table.
-    agent = QLearningAgent(
-        state_dims=env.observation_space.nvec,
-        action_size=env.action_space.n,
-        learning_rate=0,  # no learning during evaluation
-        gamma=0,
-        epsilon=0,  # no exploration during evaluation; always pick the best action
-    )
-    agent.load_q_table(config.Q_TABLE_SAVE_PATH)
+    # Initialize the agent and load the Q-table
+    state_dims = env.observation_space.nvec
+    state_size = np.prod(state_dims)  # size of the full state space
+    action_size = env.action_space.n
+    agent = QLearningAgent(config['agent'], state_size, action_size)
+    agent.load_q_table('saved_models/q_table.npy')
 
-    print("--- Starting evaluation of the trained agent ---")
-    state, info = env.reset()
-    terminated = False
-    total_reward = 0
+    print(f"State space size: {state_size}")
+    print(f"Action space size: {action_size}")
+    print(f"Q-table shape: {agent.q_table.shape}")
 
-    while not terminated:
-        action = agent.get_action(state)
-        state, reward, terminated, truncated, info = env.step(action)
-        total_reward += reward
-        print(f"State: {state}, chosen action: {action}, reward: {reward:.4f}")
+    # Run the evaluation
+    num_episodes = 10
+    total_rewards = []
 
-    print("\n✅ Evaluation finished!")
-    print(f"Final negotiated price: {env.current_price:.2f} (target price: {env.target_price})")
-    print(f"Total reward: {total_reward:.4f}")
-    env.close()
+    for episode in range(num_episodes):
+        state, _ = env.reset()
+        episode_reward = 0
+        done = False
+
+        while not done:
+            # Convert the state to a flat index
+            state_idx = np.ravel_multi_index(tuple(state), env.observation_space.nvec)
+            # Select the optimal action
+            action = np.argmax(agent.q_table[state_idx])
+
+            # Take one step in the environment
+            next_state, reward, done, _, _ = env.step(action)
+            episode_reward += reward
+            state = next_state
+
+            # Print the current status
+            print(f"Episode {episode + 1}")
+            print(f"State: {env.spaces.get_state_description(state)}")
+            print(f"Action: {env.spaces.get_action_description(action)}")
+            print(f"Reward: {reward:.2f}")
+            print(f"Current Price: {env.current_price:.2f}")
+            print("--------------------")
+
+        total_rewards.append(episode_reward)
+        print(f"Episode {episode + 1} finished with total reward: {episode_reward:.2f}")
+        print("========================================")
+
+    print(f"Average reward over {num_episodes} episodes: {np.mean(total_rewards):.2f}")
 
 
 if __name__ == "__main__":
-    evaluate()
+    main()
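The state-to-index conversion above is the key trick for using a flat Q-table with a MultiDiscrete observation: np.ravel_multi_index maps each multi-part observation onto a unique row. A minimal, self-contained sketch (the 4 × 3 × 3 shape matches the comment in train_offline.py below; the observation value is made up):

import numpy as np

# MultiDiscrete dimensions; 4 * 3 * 3 = 36 flat states
nvec = (4, 3, 3)
q_table = np.zeros((int(np.prod(nvec)), 9))  # one row per state, one column per action

obs = (2, 1, 0)  # hypothetical observation from env.reset() / env.step()
state_idx = np.ravel_multi_index(obs, nvec)  # unique index in [0, 36)
best_action = int(np.argmax(q_table[state_idx]))

# np.unravel_index is the inverse mapping, used when exporting the JSON Q-table
assert tuple(int(i) for i in np.unravel_index(state_idx, nvec)) == obs
print(state_idx, best_action)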
tests/
@@ -1,7 +1,7 @@
 import unittest
 
 from agents.offline_agent import QLearningAgent
-from envs.my_custom_env import MyCustomEnv
+from negotiation_agent.environment import NegotiationEnv
 from usecases.evaluate_agent_usecase import EvaluateAgentUseCase
 
 class TestEvaluateAgentUseCase(unittest.TestCase):
 
@@ -10,7 +10,7 @@ class TestEvaluateAgentUseCase(unittest.TestCase):
         self.state_size = 10
         self.action_size = 2
         self.agent = QLearningAgent(self.agent_params, self.state_size, self.action_size)
-        self.env = MyCustomEnv()
+        self.env = NegotiationEnv()
         self.use_case = EvaluateAgentUseCase()
 
     def test_execute(self):
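The updated suite still runs through the standard unittest runner. A small sketch of invoking it programmatically (equivalent to `python -m unittest discover tests`):

import unittest

# Discover and run everything under tests/
suite = unittest.defaultTestLoader.discover("tests")
unittest.TextTestRunner(verbosity=2).run(suite)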
train_offline.py
@@ -2,8 +2,12 @@ import h5py
 import numpy as np
 import yaml
 import os
+import json
+from datetime import datetime
 
 from agents.offline_agent import QLearningAgent
+from negotiation_agent.spaces import NegotiationSpaces
+from negotiation_agent.environment import NegotiationEnv
 
 def main():
     with open("configs/offline_env_config.yaml", "r") as f:
@@ -19,10 +23,12 @@ def main():
         next_observations = f["next_observations"][:]
         terminals = f["terminals"][:]
 
-    state_size = len(np.unique(np.concatenate((observations, next_observations))))
-    action_size = len(np.unique(actions))
+    env = NegotiationEnv()
+    state_size = np.prod(env.observation_space.nvec)  # 4 * 3 * 3 = 36
+    action_size = env.action_space.n  # 9
 
-    agent = QLearningAgent(config["agent_params"], state_size, action_size)
+    agent = QLearningAgent(config["agent"], state_size, action_size)  # fixed to read config["agent"]
 
     num_epochs = 10
     for epoch in range(num_epochs):
@@ -38,12 +44,61 @@ def main():
         }
         agent.learn(batch)
 
-    # Save the model
+    # Save the model (npy format)
     saved_models_dir = "saved_models"
     os.makedirs(saved_models_dir, exist_ok=True)
     model_path = os.path.join(saved_models_dir, "q_table.npy")
-    agent.save_model(model_path)
+    np.save(model_path, agent.q_table)
+
+    # Also save the Q-table in JSON format
+    spaces = NegotiationSpaces()
+    q_table_data = {
+        "metadata": {
+            "state_size": int(state_size),
+            "action_size": int(action_size),
+            "timestamp": datetime.now().isoformat(),
+            "training_episodes": int(num_epochs)
+        },
+        "q_values": []
+    }
+
+    # Record the Q-values for each state
+    for state_idx in range(state_size):
+        state_indices = np.unravel_index(state_idx, env.observation_space.nvec)
+        state_data = {
+            "state_idx": int(state_idx),
+            "state_desc": spaces.get_state_description(
+                [int(idx) for idx in state_indices]
+            ),
+            "actions": []
+        }
+
+        # Record the Q-value for each action
+        for action_idx in range(action_size):
+            action_data = {
+                "action_idx": int(action_idx),
+                "action_desc": spaces.get_action_description(action_idx),
+                "q_value": float(agent.q_table[state_idx, action_idx])
+            }
+            state_data["actions"].append(action_data)
+
+        # Add the optimal action for this state
+        optimal_action_idx = int(np.argmax(agent.q_table[state_idx]))
+        state_data["optimal_action"] = {
+            "action_idx": optimal_action_idx,
+            "action_desc": spaces.get_action_description(optimal_action_idx),
+            "q_value": float(agent.q_table[state_idx, optimal_action_idx])
+        }
+
+        q_table_data["q_values"].append(state_data)
+
+    # Write the JSON file
+    json_path = os.path.join(saved_models_dir, "q_table.json")
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(q_table_data, f, ensure_ascii=False, indent=2)
 
     print(f"Model saved to {model_path}")
+    print(f"Q-table JSON saved to {json_path}")
 
 if __name__ == "__main__":
     main()
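With both artifacts on disk, downstream tooling can pick whichever representation suits it: q_table.npy for fast numeric loading, q_table.json for human inspection. A minimal sketch of consuming each (paths follow the diff above):

import json
import numpy as np

# Binary form: the raw (36, 9) array saved with np.save above
q_table = np.load("saved_models/q_table.npy")
greedy_policy = q_table.argmax(axis=1)  # best action index per state

# JSON form: self-describing, with human-readable state/action descriptions
with open("saved_models/q_table.json", encoding="utf-8") as f:
    q_data = json.load(f)

for entry in q_data["q_values"][:3]:
    opt = entry["optimal_action"]
    print(entry["state_desc"], "->", opt["action_desc"], f"(q={opt['q_value']:.3f})")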
usecases/evaluate_agent_usecase.py
@@ -1,8 +1,8 @@
 from agents.offline_agent import QLearningAgent
-from envs.my_custom_env import MyCustomEnv
+from negotiation_agent.environment import NegotiationEnv
 
 class EvaluateAgentUseCase:
-    def execute(self, agent: QLearningAgent, env: MyCustomEnv, num_episodes: int):
+    def execute(self, agent: QLearningAgent, env: NegotiationEnv, num_episodes: int):
         total_rewards = 0
         for _ in range(num_episodes):
             obs, _ = env.reset()
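For context, the use case is invoked the same way the test above wires it up: construct an agent and a NegotiationEnv, then hand both to execute(). A hedged sketch (the agent_params keys are placeholders, not the project's actual config):

from agents.offline_agent import QLearningAgent
from negotiation_agent.environment import NegotiationEnv
from usecases.evaluate_agent_usecase import EvaluateAgentUseCase

# Placeholder hyperparameters; the real values live in configs/offline_env_config.yaml
agent_params = {"learning_rate": 0.1, "gamma": 0.99, "epsilon": 0.1}

env = NegotiationEnv()
agent = QLearningAgent(agent_params, state_size=36, action_size=9)

use_case = EvaluateAgentUseCase()
use_case.execute(agent, env, num_episodes=10)  # runs episodes and accumulates rewards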