mirror of https://github.com/linyiLYi/street-fighter-ai.git (synced 2025-04-03 22:50:43 +00:00)

Commit 570d5bbe5c ("hyperparameter")
Parent: ded261ba69
Binary file not shown.
@@ -1,3 +1,4 @@
import os
import time

import retro
@@ -5,6 +6,9 @@ from stable_baselines3.common.monitor import Monitor

from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
os.makedirs(LOG_DIR, exist_ok=True)

def make_env(game, state):
    def _init():
        env = retro.make(
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
@@ -1,3 +1,4 @@
import math
import collections

import gym
@@ -13,7 +14,9 @@ class StreetFighterCustomWrapper(gym.Wrapper):
        self.num_frames = 3
        self.frame_stack = collections.deque(maxlen=self.num_frames)

        self.reward_coeff = 3
        self.reward_coeff = 3.0

        self.total_timesteps = 0

        self.full_hp = 176
        self.prev_player_health = self.full_hp
@@ -37,6 +40,8 @@ class StreetFighterCustomWrapper(gym.Wrapper):
        observation = self.env.reset()
        self.prev_player_health = self.full_hp
        self.prev_oppont_health = self.full_hp

        self.total_timesteps = 0

        # Clear the frame stack and add the first observation [num_frames] times
        self.frame_stack.clear()
@@ -50,20 +55,24 @@ class StreetFighterCustomWrapper(gym.Wrapper):
        obs, _reward, _done, info = self.env.step(action)
        curr_player_health = info['health']
        curr_oppont_health = info['enemy_health']

        self.total_timesteps += 1

        # Game is over and player loses.
        if curr_player_health < 0:
            custom_reward = -curr_oppont_health # Use the remaining health points of opponent as penalty.
            custom_reward = -math.pow(self.full_hp, (curr_oppont_health + 1) / (self.full_hp + 1)) # Use the remaining health points of opponent as penalty.
            # If the opponent also has negative health points, it's an even game and the penalty is only -1.
            custom_done = True

        # Game is over and player wins.
        elif curr_oppont_health < 0:
            custom_reward = curr_player_health * self.reward_coeff # Use the remaining health points of player as reward.
            # custom_reward = curr_player_health * self.reward_coeff # Use the remaining health points of player as reward.
            # Multiply by reward_coeff to make the reward larger than the penalty to avoid cowardice of agent.

            custom_reward = math.pow(self.full_hp, (5940 - self.total_timesteps) / 5940) * self.reward_coeff # Use the remaining time steps as reward.
            custom_done = True

        # While the fighting is still going on.
        # While the fighting is still going on
        else:
            custom_reward = self.reward_coeff * (self.prev_oppont_health - curr_oppont_health) - (self.prev_player_health - curr_player_health)
            self.prev_player_health = curr_player_health
@@ -75,5 +84,5 @@ class StreetFighterCustomWrapper(gym.Wrapper):
            custom_done = False

        # Max reward is 6 * full_hp = 1056 (damage * 3 + winning_reward * 3)
        return self._preprocess_observation(obs), custom_reward, custom_done, info
        return self._preprocess_observation(obs), 0.001 * custom_reward, custom_done, info # reward normalization
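Note (illustration only, not part of this commit): with full_hp = 176, the shaped terminal rewards above work out to roughly +0.528 for an instant win, decaying to +0.003 at the 5940-step limit, and about -0.176 for losing against a full-health opponent, shrinking to -0.001 on a double KO. A minimal sketch using the same constants as the wrapper:

# Sketch: evaluate the shaped terminal rewards defined in the hunk above.
import math

full_hp = 176
reward_coeff = 3.0

def win_reward(total_timesteps):
    # remaining-time bonus, scaled by reward_coeff and the 0.001 normalization
    return 0.001 * reward_coeff * math.pow(full_hp, (5940 - total_timesteps) / 5940)

def loss_penalty(opponent_health):
    # exponential penalty in the opponent's remaining health, same normalization
    return 0.001 * -math.pow(full_hp, (opponent_health + 1) / (full_hp + 1))

print(win_reward(0), win_reward(5940))      # ~0.528 right away, 0.003 at the time limit
print(loss_penalty(176), loss_penalty(-1))  # ~-0.176 vs a full-health opponent, -0.001 on a double KO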
@@ -20,30 +20,44 @@ def make_env(game, state):
game = "StreetFighterIISpecialChampionEdition-Genesis"
state_stages = [
    "Champion.Level1.RyuVsGuile",
    "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3 | -20.4
    "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
    "Champion.Level2.ChunLiVsKen",
    "Champion.Level3.ChunLiVsChunLi",
    "Champion.Level4.ChunLiVsZangief",
    "Champion.Level5.ChunLiVsDhalsim",
    "Champion.Level6.ChunLiVsRyu",
    "Champion.Level7.ChunLiVsEHonda",
    "Champion.Level8.ChunLiVsBlanka",
    "Champion.Level9.ChunLiVsBalrog",
    "Champion.Level10.ChunLiVsVega",
    "Champion.Level11.ChunLiVsSagat",
    "Champion.Level12.ChunLiVsBison"
    # Add other stages as necessary
    "Champion.Level2.RyuVsKen",
    "Champion.Level3.RyuVsChunLi",
    "Champion.Level4.RyuVsZangief",
    "Champion.Level5.RyuVsDhalsim",
    "Champion.Level6.RyuVsRyu",
    "Champion.Level7.RyuVsEHonda",
    "Champion.Level8.RyuVsBlanka",
    "Champion.Level9.RyuVsBalrog",
    "Champion.Level10.RyuVsVega",
    "Champion.Level11.RyuVsSagat",
    "Champion.Level12.RyuVsBison"
]
# state_stages = [
# "Champion.Level1.RyuVsGuile",
# "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3 | -20.4
# "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
# "Champion.Level2.ChunLiVsKen",
# "Champion.Level3.ChunLiVsChunLi",
# "Champion.Level4.ChunLiVsZangief",
# "Champion.Level5.ChunLiVsDhalsim",
# "Champion.Level6.ChunLiVsRyu",
# "Champion.Level7.ChunLiVsEHonda",
# "Champion.Level8.ChunLiVsBlanka",
# "Champion.Level9.ChunLiVsBalrog",
# "Champion.Level10.ChunLiVsVega",
# "Champion.Level11.ChunLiVsSagat",
# "Champion.Level12.ChunLiVsBison"
# # Add other stages as necessary
# ]

env = make_env(game, state_stages[0])()
env = make_env(game, state_stages[11])()

model = PPO(
    "CnnPolicy",
    env,
    verbose=1
)
model_path = r"trained_models_level_1/ppo_ryu_000000_steps"
model_path = r"trained_models_ryu_level_1_time_reward_small_random/ppo_ryu_2600000_steps"
model.load(model_path)
# Average reward for optuna/trial_1_best_model: -82.3
# Average reward for optuna/trial_9_best_model: 36.7 | -86.23
@@ -60,6 +74,7 @@ for _ in range(num_episodes):
    obs = env.reset()
    total_reward = 0
    while not done:
    # while True:
        timestamp = time.time()
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
@@ -68,9 +83,9 @@ for _ in range(num_episodes):
            total_reward += reward
            print("Reward: {}, playerHP: {}, enemyHP:{}".format(reward, info['health'], info['enemy_health']))
        env.render()
        time.sleep(0.01)
        # time.sleep(0.005)
    print("Total reward: {}".format(total_reward))
    episode_reward_sum += total_reward

env.close()
print("Average reward for {}: {}".format(model_path, episode_reward_sum/num_episodes))
# env.close()
# print("Average reward for {}: {}".format(model_path, episode_reward_sum/num_episodes))
@@ -1,4 +1,5 @@
import os
import sys
import random

import retro
@@ -8,7 +9,7 @@ from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback

from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
LOG_DIR = 'logs'
os.makedirs(LOG_DIR, exist_ok=True)

class RandomOpponentChangeCallback(BaseCallback):
@@ -58,21 +59,36 @@ def main():

    state_stages = [
        "Champion.Level1.RyuVsGuile",
        "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3
        "Champion.Level2.ChunLiVsKen",
        "Champion.Level3.ChunLiVsChunLi",
        "Champion.Level4.ChunLiVsZangief",
        "Champion.Level5.ChunLiVsDhalsim",
        "Champion.Level6.ChunLiVsRyu",
        "Champion.Level7.ChunLiVsEHonda",
        "Champion.Level8.ChunLiVsBlanka",
        "Champion.Level9.ChunLiVsBalrog",
        "Champion.Level10.ChunLiVsVega",
        "Champion.Level11.ChunLiVsSagat",
        "Champion.Level12.ChunLiVsBison"
        # Add other stages as necessary
        "Champion.Level2.RyuVsKen",
        "Champion.Level3.RyuVsChunLi",
        "Champion.Level4.RyuVsZangief",
        "Champion.Level5.RyuVsDhalsim",
        "Champion.Level6.RyuVsRyu",
        "Champion.Level7.RyuVsEHonda",
        "Champion.Level8.RyuVsBlanka",
        "Champion.Level9.RyuVsBalrog",
        "Champion.Level10.RyuVsVega",
        "Champion.Level11.RyuVsSagat",
        "Champion.Level12.RyuVsBison"
    ]

    # state_stages = [
    # "Champion.Level1.RyuVsGuile",
    # "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3
    # "Champion.Level2.ChunLiVsKen",
    # "Champion.Level3.ChunLiVsChunLi",
    # "Champion.Level4.ChunLiVsZangief",
    # "Champion.Level5.ChunLiVsDhalsim",
    # "Champion.Level6.ChunLiVsRyu",
    # "Champion.Level7.ChunLiVsEHonda",
    # "Champion.Level8.ChunLiVsBlanka",
    # "Champion.Level9.ChunLiVsBalrog",
    # "Champion.Level10.ChunLiVsVega",
    # "Champion.Level11.ChunLiVsSagat",
    # "Champion.Level12.ChunLiVsBison"
    # # Add other stages as necessary
    # ]

    # state_stages = [
    # "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
    # "ChampionX.Level2.ChunLiVsChunLi",
@@ -103,31 +119,27 @@ def main():
        n_steps=1024,
        batch_size=64,
        learning_rate=1e-4,
        ent_coef=0.01,
        clip_range=0.2,
        gamma=0.95,
        gae_lambda=0.81322,
        tensorboard_log="logs/"
        tensorboard_log="logs"
    )

    # Set the save directory
    save_dir = "trained_models_ryu_level_1_reward_x3"
    save_dir = "trained_models_ryu_level_1_time_reward_small_random"
    os.makedirs(save_dir, exist_ok=True)

    # Load the model from file
    # model_path = "trained_models/ppo_chunli_1296000_steps.zip"
    model_path = "trained_models_ryu_level_1_time_reward_small_continue/ppo_ryu_400000_steps.zip"

    # Load model and modify the learning rate and entropy coefficient
    # custom_objects = {
    # "learning_rate": 0.0002
    # }
    # model = PPO.load(model_path, env=env, device="cuda")#, custom_objects=custom_objects)
    model = PPO.load(model_path, env=env, device="cuda")#, custom_objects=custom_objects)

    # Set up callbacks
    # opponent_interval = 35840 # stage_interval * num_envs = total_steps_per_stage
    opponent_interval = 32768 # stage_interval * num_envs = total_steps_per_stage
    checkpoint_interval = 200000 # checkpoint_interval * num_envs = total_steps_per_checkpoint (Every 80 rounds)
    checkpoint_callback = CheckpointCallback(save_freq=checkpoint_interval, save_path=save_dir, name_prefix="ppo_ryu")
    # stage_increase_callback = RandomOpponentChangeCallback(state_stages, opponent_interval, save_dir)
    stage_increase_callback = RandomOpponentChangeCallback(state_stages, opponent_interval, save_dir)

    # model_params = {
    # 'n_steps': 5,
@@ -141,12 +153,21 @@ def main():
    # }
    # model = A2C('CnnPolicy', env, tensorboard_log='logs/', verbose=1, **model_params, policy_kwargs=dict(optimizer_class=RMSpropTF))

    model.learn(
        total_timesteps=int(10000000), # total_timesteps = stage_interval * num_envs * num_stages (1120 rounds)
        callback=[checkpoint_callback]#, stage_increase_callback]
    )
    env.close()
    # Writing the training logs from stdout to a file
    original_stdout = sys.stdout
    log_file_path = os.path.join(save_dir, "training_log.txt")
    with open(log_file_path, 'w') as log_file:
        sys.stdout = log_file

        model.learn(
            total_timesteps=int(10000000), # total_timesteps = stage_interval * num_envs * num_stages (1120 rounds)
            callback=[checkpoint_callback, stage_increase_callback]
        )
        env.close()

    # Restore stdout
    sys.stdout = original_stdout

    # Save the final model
    model.save(os.path.join(save_dir, "ppo_sf2_ryu_final.zip"))
68  004_image_stack_ram_based_reward_custom/tune_ppo.py  Normal file
@@ -0,0 +1,68 @@
import os

import retro
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
OPT_DIR = 'optuna/'
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OPT_DIR, exist_ok=True)

def optimize_ppo(trial):
    return {
        'n_steps':trial.suggest_int('n_steps', 512, 2048, log=True),
        'gamma':trial.suggest_float('gamma', 0.9, 0.9999),
        'learning_rate':trial.suggest_float('learning_rate', 5e-5, 5e-4, log=True),
        'gae_lambda':trial.suggest_float('gae_lambda', 0.8, 0.9999)
    }

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

def optimize_agent(trial):
    game = "StreetFighterIISpecialChampionEdition-Genesis"
    state = "Champion.Level1.ChunLiVsGuile"#"ChampionX.Level1.ChunLiVsKen"

    try:
        model_params = optimize_ppo(trial)

        # Create environment
        env = make_env(game, state)()
        env = Monitor(env, LOG_DIR)

        # Create algo
        model = PPO('CnnPolicy', env, verbose=1, **model_params)
        model.learn(total_timesteps=500000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=30, deterministic=False)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        return -1

# Creating the experiment
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

print(study.best_params)
print(study.best_trial)
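Note (illustration only, not part of this commit): once the study finishes, study.best_params can be fed straight back into PPO for a longer training run. A minimal sketch, assuming the study object, make_env, and game from tune_ppo.py above are in scope; the step count and save path below are illustrative, not from the repository:

# Sketch: retrain a fresh PPO model with the best hyperparameters found by Optuna.
best_params = study.best_params  # {'n_steps': ..., 'gamma': ..., 'learning_rate': ..., 'gae_lambda': ...}

env = make_env(game, "Champion.Level1.ChunLiVsGuile")()  # same state used during tuning
model = PPO('CnnPolicy', env, verbose=1, **best_params)
model.learn(total_timesteps=2000000)            # longer than the 500000-step tuning budget; value is illustrative
model.save('optuna/final_best_params_model')    # hypothetical output path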
Binary file not shown.
51  005_rgb_stack_ram_based_reward_time_penalty/check_reward.py  Normal file
@@ -0,0 +1,51 @@
import os
import time

import retro
from stable_baselines3.common.monitor import Monitor

from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
os.makedirs(LOG_DIR, exist_ok=True)

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

game = "StreetFighterIISpecialChampionEdition-Genesis"
state = "Champion.Level1.RyuVsGuile"

env = make_env(game, state)()
env = Monitor(env, 'logs/')

num_episodes = 30
episode_reward_sum = 0
for _ in range(num_episodes):
    done = False
    obs = env.reset()
    total_reward = 0
    while not done:
        timestamp = time.time()
        obs, reward, done, info = env.step(env.action_space.sample())

        # Note that if player wins but only has 0 HP left, the winning reward is still 0, so it won't be printed.
        if reward != 0:
            total_reward += reward
            print("Reward: {}, playerHP: {}, enemyHP:{}".format(reward, info['health'], info['enemy_health']))
        env.render()
        # time.sleep(0.005)

    print("Total reward: {}".format(total_reward))
    episode_reward_sum += total_reward

env.close()
print("Average reward for random strategy: {}".format(episode_reward_sum/num_episodes))
24  005_rgb_stack_ram_based_reward_time_penalty/custom_cnn.py  Normal file
@@ -0,0 +1,24 @@
import gym
import torch
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

# Custom feature extractor (CNN)
class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.Space):
        super(CustomCNN, self).__init__(observation_space, features_dim=512)
        self.cnn = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=5, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=5, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(16384, self.features_dim),
            nn.ReLU()
        )

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        return self.cnn(observations)
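Note (illustration only, not part of this commit): the hard-coded nn.Linear(16384, 512) corresponds to 64 channels x 16 x 16, which is what the three conv layers produce for a 4-channel 84x84 observation; the file does not state the expected input shape, so 84x84 is an assumption here. A minimal sketch:

# Sketch: verify the flattened feature size for an assumed 4x84x84 input.
import torch
dummy = torch.zeros(1, 4, 84, 84)  # (batch, channels, height, width) -- assumed shape
conv = torch.nn.Sequential(
    torch.nn.Conv2d(4, 32, kernel_size=5, stride=2, padding=0),
    torch.nn.Conv2d(32, 64, kernel_size=5, stride=2, padding=0),
    torch.nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
)
print(conv(dummy).flatten(1).shape)  # torch.Size([1, 16384]) -> matches nn.Linear(16384, 512)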
52  005_rgb_stack_ram_based_reward_time_penalty/evaluate.py  Normal file
@@ -0,0 +1,52 @@
import retro

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

from custom_cnn import CustomCNN
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

game = "StreetFighterIISpecialChampionEdition-Genesis"
state_stages = [
    "Champion.Level1.ChunLiVsGuile",
    "Champion.Level2.ChunLiVsKen",
    "Champion.Level3.ChunLiVsChunLi",
    "Champion.Level4.ChunLiVsZangief",
    "Champion.Level5.ChunLiVsDhalsim",
    "Champion.Level6.ChunLiVsRyu",
    "Champion.Level7.ChunLiVsEHonda",
    "Champion.Level8.ChunLiVsBlanka",
    "Champion.Level9.ChunLiVsBalrog",
    "Champion.Level10.ChunLiVsVega",
    "Champion.Level11.ChunLiVsSagat",
    "Champion.Level12.ChunLiVsBison"
    # Add other stages as necessary
]

env = make_env(game, state_stages[0])()

# Wrap the environment
# env = Monitor(env, 'logs/')

policy_kwargs = {'features_extractor_class': CustomCNN}
model = PPO("CnnPolicy", env, policy_kwargs=policy_kwargs)

model = PPO.load(r"dummy_model_ppo_chunli")
# model.load(r"trained_models/ppo_chunli_864000_steps")

mean_reward, std_reward = evaluate_policy(model, env, render=True, n_eval_episodes=10, deterministic=False, return_episode_rewards=True)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
72  005_rgb_stack_ram_based_reward_time_penalty/logs/monitor.csv  Normal file
@@ -0,0 +1,72 @@
#{"t_start": 1680493345.1918354, "env_id": null}
r,l,t
121.963397,1218,4.678451
150.73549,1321,10.670471
-68.875587,1685,16.915831
110.079359,1992,23.723121
-142.422045,2475,30.929999
25.618847,2409,37.817851
-10.399282,2048,44.290086
103.210152,2378,51.321482
-10.83336,1410,56.861604
-109.50968,2320,63.936125
-56.133868,1884,70.48871
-1.263258,3856,83.55072
137.096617,1955,89.782694
-173.590936,2482,96.620559
-32.06399,2622,103.507519
-33.546645,2359,110.155689
247.618832,631,114.254022
34.18776,1988,121.014232
-16.0,2372,128.411761
142.528257,1504,131.040915
-35.511721,2100,141.725583
210.669563,1386,144.090119
106.166972,1756,150.146413
-291.452157,1771,156.139224
-103.271598,2993,163.884984
-3.09159,2881,176.148215
-69.312432,2968,184.263354
11.966888,3420,196.801994
-72.699861,2025,203.564955
-148.405698,1736,209.933663
-6.505259,2863,217.596923
-45.308495,1642,223.866472
-60.133868,1684,230.124691
20.320741,1912,236.920414
12.850028,3201,245.687548
44.331029,2020,252.592099
4.970357,2426,260.241496
-26.074153,3193,272.014878
41.36973,2057,278.669485
-58.986068,1951,285.105779
-6.22689,3090,293.487389
42.518735,2469,304.130861
28.699296,1979,310.846102
-0.123948,2276,318.259012
-130.860871,3166,326.553931
-94.074138,2784,337.55811
-6.779892,1688,340.079623
-2.846787,1842,346.163562
-6.846787,2137,352.507397
74.03875,2905,364.386266
91.493381,1660,370.441151
-60.286049,1502,372.846128
-127.450026,2030,379.306661
-90.028712,1565,385.293074
-45.615818,2020,391.765668
-49.94699,3488,403.627649
-90.632338,2270,410.494766
46.011777,3184,419.074744
-11.461924,2326,429.595886
181.774886,1260,431.709853
-40.06399,2863,442.810548
-90.86073,2232,449.485116
72.549001,2173,455.957089
66.832361,1597,461.523406
29.003218,1923,464.330122
-57.986068,3347,476.034669
175.784026,1320,481.626563
-328.0,1320,483.822258
-81.578734,2842,495.411107
72.161772,1818,501.982363
Can't render this file because it contains an unexpected character in line 1 and column 3.
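Note (illustration only, not part of this commit): the columns in this stable-baselines3 Monitor log are r (episode reward), l (episode length in steps), and t (seconds since training start). A minimal summary sketch, assuming pandas is installed and the path below:

# Sketch: summarize the Monitor log above; r/l/t are the SB3 Monitor default column names.
import pandas as pd
df = pd.read_csv("005_rgb_stack_ram_based_reward_time_penalty/logs/monitor.csv", skiprows=1)  # skip the JSON header line
print(df["r"].mean(), df["r"].std(), df["l"].mean())  # mean episode reward, its spread, mean episode length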
@@ -0,0 +1,88 @@
import math
import collections

import gym
import numpy as np

# Custom environment wrapper
class StreetFighterCustomWrapper(gym.Wrapper):
    def __init__(self, env, testing=False):
        super(StreetFighterCustomWrapper, self).__init__(env)
        self.env = env

        # Use a deque to store the last 4 frames
        self.num_frames = 3
        self.frame_stack = collections.deque(maxlen=self.num_frames)

        self.reward_coeff = 1.0

        self.total_timesteps = 0

        self.full_hp = 176
        self.prev_player_health = self.full_hp
        self.prev_oppont_health = self.full_hp

        # Update observation space to include stacked grayscale images
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(100, 128, 3), dtype=np.uint8)

        self.testing = testing

    def _preprocess_observation(self, observation):

        # Stack the downsampled frames.
        self.frame_stack.append(observation[::2, ::2, :])

        # Stack the R, G, B channel of each frame and return the "image".
        stacked_image = np.stack([frame[:, :, i] for i, frame in enumerate(self.frame_stack)], axis=-1)
        return stacked_image

    def reset(self):
        observation = self.env.reset()
        self.prev_player_health = self.full_hp
        self.prev_oppont_health = self.full_hp

        self.total_timesteps = 0

        # Clear the frame stack and add the first observation [num_frames] times
        self.frame_stack.clear()
        for _ in range(self.num_frames):
            self.frame_stack.append(observation[::2, ::2, :])

        return np.stack([frame[:, :, i] for i, frame in enumerate(self.frame_stack)], axis=-1)

    def step(self, action):

        obs, _reward, _done, info = self.env.step(action)
        curr_player_health = info['health']
        curr_oppont_health = info['enemy_health']

        self.total_timesteps += 1

        # Game is over and player loses.
        if curr_player_health < 0:
            custom_reward = -math.pow(self.full_hp, (curr_oppont_health + 1) / (self.full_hp + 1)) # Use the remaining health points of opponent as penalty.
            # If the opponent also has negative health points, it's an even game and the penalty is only -1.
            custom_done = True

        # Game is over and player wins.
        elif curr_oppont_health < 0:
            # custom_reward = curr_player_health * self.reward_coeff # Use the remaining health points of player as reward.
            # Multiply by reward_coeff to make the reward larger than the penalty to avoid cowardice of agent.

            custom_reward = math.pow(self.full_hp, (5940 - self.total_timesteps) / 5940)
            custom_done = True

        # While the fighting is still going on
        else:
            custom_reward = self.reward_coeff * (self.prev_oppont_health - curr_oppont_health) - (self.prev_player_health - curr_player_health)
            self.prev_player_health = curr_player_health
            self.prev_oppont_health = curr_oppont_health
            custom_done = False

        # During testing, the session should always keep going.
        if self.testing:
            custom_done = False

        # Max reward is 6 * full_hp = 1056 (damage * 3 + winning_reward * 3)
        return self._preprocess_observation(obs), custom_reward, custom_done, info
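Note (illustration only, not part of this commit): the _preprocess_observation trick above fuses the three most recent downsampled RGB frames into one 3-channel image by taking the red channel of the oldest frame, the green channel of the middle one, and the blue channel of the newest. A minimal numpy sketch:

# Sketch: channel i of the fused image comes from frame i, as in np.stack(..., axis=-1) above.
import numpy as np
frames = [np.random.randint(0, 256, (100, 128, 3), dtype=np.uint8) for _ in range(3)]  # dummy downsampled frames
fused = np.stack([frame[:, :, i] for i, frame in enumerate(frames)], axis=-1)
print(fused.shape)  # (100, 128, 3): R of frame 0, G of frame 1, B of frame 2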
76  005_rgb_stack_ram_based_reward_time_penalty/test.py  Normal file
@@ -0,0 +1,76 @@
import time

import retro
from stable_baselines3 import PPO

from street_fighter_custom_wrapper import StreetFighterCustomWrapper

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

game = "StreetFighterIISpecialChampionEdition-Genesis"
state_stages = [
    "Champion.Level1.RyuVsGuile",
    "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3 | -20.4
    "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
    "Champion.Level2.ChunLiVsKen",
    "Champion.Level3.ChunLiVsChunLi",
    "Champion.Level4.ChunLiVsZangief",
    "Champion.Level5.ChunLiVsDhalsim",
    "Champion.Level6.ChunLiVsRyu",
    "Champion.Level7.ChunLiVsEHonda",
    "Champion.Level8.ChunLiVsBlanka",
    "Champion.Level9.ChunLiVsBalrog",
    "Champion.Level10.ChunLiVsVega",
    "Champion.Level11.ChunLiVsSagat",
    "Champion.Level12.ChunLiVsBison"
    # Add other stages as necessary
]

env = make_env(game, state_stages[0])()

model = PPO(
    "CnnPolicy",
    env,
    verbose=1
)
model_path = r"trained_models_ryu_level_1_reward_x3/ppo_ryu_6600000_steps"
model.load(model_path)
# Average reward for optuna/trial_1_best_model: -82.3
# Average reward for optuna/trial_9_best_model: 36.7 | -86.23
# Average reward for trained_models/ppo_chunli_5376000_steps: -77.8


obs = env.reset()
done = False

num_episodes = 30
episode_reward_sum = 0
for _ in range(num_episodes):
    done = False
    obs = env.reset()
    total_reward = 0
    while True:
        timestamp = time.time()
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)

        if reward != 0:
            total_reward += reward
            print("Reward: {}, playerHP: {}, enemyHP:{}".format(reward, info['health'], info['enemy_health']))
        env.render()
        # time.sleep(0.005)
    # print("Total reward: {}".format(total_reward))
    # episode_reward_sum += total_reward

# env.close()
# print("Average reward for {}: {}".format(model_path, episode_reward_sum/num_episodes))
151  005_rgb_stack_ram_based_reward_time_penalty/train.py  Normal file
@@ -0,0 +1,151 @@
import os
import random

import retro
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback

from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs'
os.makedirs(LOG_DIR, exist_ok=True)

class RandomOpponentChangeCallback(BaseCallback):
    def __init__(self, stages, opponent_interval, verbose=0):
        super(RandomOpponentChangeCallback, self).__init__(verbose)
        self.stages = stages
        self.opponent_interval = opponent_interval

    def _on_step(self) -> bool:
        if self.n_calls % self.opponent_interval == 0:
            new_state = random.choice(self.stages)
            print("\nCurrent state:", new_state)
            self.training_env.env_method("load_state", new_state, indices=None)
        return True

# class StageIncreaseCallback(BaseCallback):
#     def __init__(self, stages, stage_interval, save_dir, verbose=0):
#         super(StageIncreaseCallback, self).__init__(verbose)
#         self.stages = stages
#         self.stage_interval = stage_interval
#         self.save_dir = save_dir
#         self.current_stage = 0

#     def _on_step(self) -> bool:
#         if self.n_calls % self.stage_interval == 0 and self.current_stage < len(self.stages) - 1:
#             self.current_stage += 1
#             new_state = self.stages[self.current_stage]
#             self.training_env.env_method("load_state", new_state, indices=None)
#             self.model.save(os.path.join(self.save_dir, f"ppo_chunli_stage_{self.current_stage}.zip"))
#         return True

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

def main():
    # Set up the environment and model
    game = "StreetFighterIISpecialChampionEdition-Genesis"

    state_stages = [
        "Champion.Level1.RyuVsGuile",
        "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3
        "Champion.Level2.ChunLiVsKen",
        "Champion.Level3.ChunLiVsChunLi",
        "Champion.Level4.ChunLiVsZangief",
        "Champion.Level5.ChunLiVsDhalsim",
        "Champion.Level6.ChunLiVsRyu",
        "Champion.Level7.ChunLiVsEHonda",
        "Champion.Level8.ChunLiVsBlanka",
        "Champion.Level9.ChunLiVsBalrog",
        "Champion.Level10.ChunLiVsVega",
        "Champion.Level11.ChunLiVsSagat",
        "Champion.Level12.ChunLiVsBison"
        # Add other stages as necessary
    ]

    # state_stages = [
    # "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
    # "ChampionX.Level2.ChunLiVsChunLi",
    # "ChampionX.Level3.ChunLiVsZangief",
    # "ChampionX.Level4.ChunLiVsDhalsim",
    # "ChampionX.Level5.ChunLiVsRyu",
    # "ChampionX.Level6.ChunLiVsEHonda",
    # "ChampionX.Level7.ChunLiVsBlanka",
    # "ChampionX.Level8.ChunLiVsGuile",
    # "ChampionX.Level9.ChunLiVsBalrog",
    # "ChampionX.Level10.ChunLiVsVega",
    # "ChampionX.Level11.ChunLiVsSagat",
    # "ChampionX.Level12.ChunLiVsBison"
    # # Add other stages as necessary
    # ]
    # Champion is at difficulty level 4, ChampionX is at difficulty level 8.

    env = make_env(game, state_stages[0])()

    # Wrap env in Monitor wrapper to record training progress
    env = Monitor(env, LOG_DIR)

    model = PPO(
        "CnnPolicy",
        env,
        device="cuda",
        verbose=1,
        n_steps=2048,
        batch_size=64,
        learning_rate=1e-4,
        gamma=0.99,
        tensorboard_log="logs"
    )

    # Set the save directory
    save_dir = "trained_models_ryu_level_1_time_reward"
    os.makedirs(save_dir, exist_ok=True)

    # Load the model from file
    # model_path = "trained_models/ppo_chunli_1296000_steps.zip"

    # Load model and modify the learning rate and entropy coefficient
    # custom_objects = {
    # "learning_rate": 0.0002
    # }
    # model = PPO.load(model_path, env=env, device="cuda")#, custom_objects=custom_objects)

    # Set up callbacks
    # opponent_interval = 35840 # stage_interval * num_envs = total_steps_per_stage
    checkpoint_interval = 200000 # checkpoint_interval * num_envs = total_steps_per_checkpoint (Every 80 rounds)
    checkpoint_callback = CheckpointCallback(save_freq=checkpoint_interval, save_path=save_dir, name_prefix="ppo_ryu")
    # stage_increase_callback = RandomOpponentChangeCallback(state_stages, opponent_interval, save_dir)

    # model_params = {
    # 'n_steps': 5,
    # 'gamma': 0.99,
    # 'gae_lambda':1,
    # 'learning_rate': 7e-4,
    # 'vf_coef': 0.5,
    # 'ent_coef': 0.0,
    # 'max_grad_norm':0.5,
    # 'rms_prop_eps':1e-05
    # }
    # model = A2C('CnnPolicy', env, tensorboard_log='logs/', verbose=1, **model_params, policy_kwargs=dict(optimizer_class=RMSpropTF))

    model.learn(
        total_timesteps=int(10000000), # total_timesteps = stage_interval * num_envs * num_stages (1120 rounds)
        callback=[checkpoint_callback]#, stage_increase_callback]
    )
    env.close()

    # Save the final model
    model.save(os.path.join(save_dir, "ppo_sf2_ryu_final.zip"))

if __name__ == "__main__":
    main()
68  005_rgb_stack_ram_based_reward_time_penalty/tune_ppo.py  Normal file
@@ -0,0 +1,68 @@
import os

import retro
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
OPT_DIR = 'optuna/'
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OPT_DIR, exist_ok=True)

def optimize_ppo(trial):
    return {
        'n_steps':trial.suggest_int('n_steps', 512, 2048, log=True),
        'gamma':trial.suggest_float('gamma', 0.9, 0.9999),
        'learning_rate':trial.suggest_float('learning_rate', 5e-5, 5e-4, log=True),
        'gae_lambda':trial.suggest_float('gae_lambda', 0.8, 0.9999)
    }

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

def optimize_agent(trial):
    game = "StreetFighterIISpecialChampionEdition-Genesis"
    state = "Champion.Level1.ChunLiVsGuile"#"ChampionX.Level1.ChunLiVsKen"

    try:
        model_params = optimize_ppo(trial)

        # Create environment
        env = make_env(game, state)()
        env = Monitor(env, LOG_DIR)

        # Create algo
        model = PPO('CnnPolicy', env, verbose=1, **model_params)
        model.learn(total_timesteps=500000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=30, deterministic=False)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        return -1

# Creating the experiment
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

print(study.best_params)
print(study.best_trial)