hyperparameter

linyiLYi 2023-04-03 20:50:10 +08:00
parent ded261ba69
commit 570d5bbe5c
30 changed files with 2798 additions and 255 deletions

View File

@@ -1,3 +1,4 @@
+import os
 import time
 import retro
@@ -5,6 +6,9 @@ from stable_baselines3.common.monitor import Monitor
 from street_fighter_custom_wrapper import StreetFighterCustomWrapper
+LOG_DIR = 'logs/'
+os.makedirs(LOG_DIR, exist_ok=True)
 def make_env(game, state):
     def _init():
         env = retro.make(

File diff suppressed because it is too large.

View File

@@ -1,3 +1,4 @@
+import math
 import collections
 import gym
@@ -13,7 +14,9 @@ class StreetFighterCustomWrapper(gym.Wrapper):
         self.num_frames = 3
         self.frame_stack = collections.deque(maxlen=self.num_frames)
-        self.reward_coeff = 3
+        self.reward_coeff = 3.0
+        self.total_timesteps = 0
         self.full_hp = 176
         self.prev_player_health = self.full_hp
@@ -37,6 +40,8 @@ class StreetFighterCustomWrapper(gym.Wrapper):
         observation = self.env.reset()
         self.prev_player_health = self.full_hp
         self.prev_oppont_health = self.full_hp
+        self.total_timesteps = 0
         # Clear the frame stack and add the first observation [num_frames] times
         self.frame_stack.clear()
@@ -50,20 +55,24 @@ class StreetFighterCustomWrapper(gym.Wrapper):
         obs, _reward, _done, info = self.env.step(action)
         curr_player_health = info['health']
         curr_oppont_health = info['enemy_health']
+        self.total_timesteps += 1
         # Game is over and player loses.
         if curr_player_health < 0:
-            custom_reward = -curr_oppont_health # Use the remaining health points of opponent as penalty.
+            custom_reward = -math.pow(self.full_hp, (curr_oppont_health + 1) / (self.full_hp + 1)) # Use the remaining health points of opponent as penalty.
             # If the opponent also has negative health points, it's an even game and the penalty is minimal (about -1).
             custom_done = True
         # Game is over and player wins.
         elif curr_oppont_health < 0:
-            custom_reward = curr_player_health * self.reward_coeff # Use the remaining health points of player as reward.
+            # custom_reward = curr_player_health * self.reward_coeff # Use the remaining health points of player as reward.
             # Multiply by reward_coeff to make the reward larger than the penalty to avoid cowardice of agent.
+            custom_reward = math.pow(self.full_hp, (5940 - self.total_timesteps) / 5940) * self.reward_coeff # Use the remaining time steps as reward.
             custom_done = True
         # While the fighting is still going on
         else:
             custom_reward = self.reward_coeff * (self.prev_oppont_health - curr_oppont_health) - (self.prev_player_health - curr_player_health)
             self.prev_player_health = curr_player_health
@@ -75,5 +84,5 @@ class StreetFighterCustomWrapper(gym.Wrapper):
             custom_done = False
         # Max reward is 6 * full_hp = 1056 (damage * 3 + winning_reward * 3)
-        return self._preprocess_observation(obs), custom_reward, custom_done, info
+        return self._preprocess_observation(obs), 0.001 * custom_reward, custom_done, info # reward normalization
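The new shaping replaces the linear HP-based terminal rewards with exponential ones: a loss costs up to full_hp (scaled by how much health the opponent keeps), a win pays full_hp decaying toward 1 as the step budget runs out, and everything is scaled by 0.001 at the end. A minimal sketch of the resulting magnitudes, assuming 5940 is the per-round step budget the constant refers to:

import math

FULL_HP = 176
MAX_STEPS = 5940  # assumed per-round step budget, matching the constant in the diff

def win_reward(t, coeff=3.0):
    # Decays from coeff * FULL_HP at t=0 down to coeff * 1 at t=MAX_STEPS.
    return coeff * math.pow(FULL_HP, (MAX_STEPS - t) / MAX_STEPS)

def loss_penalty(enemy_hp):
    # Grows from about -1 (enemy at -1 HP, a near-draw) to -FULL_HP (enemy untouched).
    return -math.pow(FULL_HP, (enemy_hp + 1) / (FULL_HP + 1))

for t in (1000, 3000, 5940):
    print(f"win at step {t}: {0.001 * win_reward(t):+.3f}")   # normalized as in step()
for hp in (176, 88, 0):
    print(f"loss, enemy HP {hp}: {0.001 * loss_penalty(hp):+.3f}")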

View File

@@ -20,30 +20,44 @@ def make_env(game, state):
 game = "StreetFighterIISpecialChampionEdition-Genesis"
 state_stages = [
     "Champion.Level1.RyuVsGuile",
-    "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3 | -20.4
-    "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
-    "Champion.Level2.ChunLiVsKen",
-    "Champion.Level3.ChunLiVsChunLi",
-    "Champion.Level4.ChunLiVsZangief",
-    "Champion.Level5.ChunLiVsDhalsim",
-    "Champion.Level6.ChunLiVsRyu",
-    "Champion.Level7.ChunLiVsEHonda",
-    "Champion.Level8.ChunLiVsBlanka",
-    "Champion.Level9.ChunLiVsBalrog",
-    "Champion.Level10.ChunLiVsVega",
-    "Champion.Level11.ChunLiVsSagat",
-    "Champion.Level12.ChunLiVsBison"
-    # Add other stages as necessary
+    "Champion.Level2.RyuVsKen",
+    "Champion.Level3.RyuVsChunLi",
+    "Champion.Level4.RyuVsZangief",
+    "Champion.Level5.RyuVsDhalsim",
+    "Champion.Level6.RyuVsRyu",
+    "Champion.Level7.RyuVsEHonda",
+    "Champion.Level8.RyuVsBlanka",
+    "Champion.Level9.RyuVsBalrog",
+    "Champion.Level10.RyuVsVega",
+    "Champion.Level11.RyuVsSagat",
+    "Champion.Level12.RyuVsBison"
 ]
+# state_stages = [
+#     "Champion.Level1.RyuVsGuile",
+#     "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3 | -20.4
+#     "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
+#     "Champion.Level2.ChunLiVsKen",
+#     "Champion.Level3.ChunLiVsChunLi",
+#     "Champion.Level4.ChunLiVsZangief",
+#     "Champion.Level5.ChunLiVsDhalsim",
+#     "Champion.Level6.ChunLiVsRyu",
+#     "Champion.Level7.ChunLiVsEHonda",
+#     "Champion.Level8.ChunLiVsBlanka",
+#     "Champion.Level9.ChunLiVsBalrog",
+#     "Champion.Level10.ChunLiVsVega",
+#     "Champion.Level11.ChunLiVsSagat",
+#     "Champion.Level12.ChunLiVsBison"
+#     # Add other stages as necessary
+# ]
-env = make_env(game, state_stages[0])()
+env = make_env(game, state_stages[11])()
 model = PPO(
     "CnnPolicy",
     env,
     verbose=1
 )
-model_path = r"trained_models_level_1/ppo_ryu_000000_steps"
+model_path = r"trained_models_ryu_level_1_time_reward_small_random/ppo_ryu_2600000_steps"
 model.load(model_path)
 # Average reward for optuna/trial_1_best_model: -82.3
 # Average reward for optuna/trial_9_best_model: 36.7 | -86.23
@@ -60,6 +74,7 @@ for _ in range(num_episodes):
     obs = env.reset()
     total_reward = 0
     while not done:
+    # while True:
         timestamp = time.time()
         action, _states = model.predict(obs)
         obs, reward, done, info = env.step(action)
@@ -68,9 +83,9 @@ for _ in range(num_episodes):
             total_reward += reward
             print("Reward: {}, playerHP: {}, enemyHP:{}".format(reward, info['health'], info['enemy_health']))
         env.render()
-        time.sleep(0.01)
+        # time.sleep(0.005)
     print("Total reward: {}".format(total_reward))
     episode_reward_sum += total_reward
-env.close()
-print("Average reward for {}: {}".format(model_path, episode_reward_sum/num_episodes))
+# env.close()
+# print("Average reward for {}: {}".format(model_path, episode_reward_sum/num_episodes))

View File

@@ -1,4 +1,5 @@
 import os
+import sys
 import random
 import retro
@@ -8,7 +9,7 @@ from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback
 from street_fighter_custom_wrapper import StreetFighterCustomWrapper
-LOG_DIR = 'logs/'
+LOG_DIR = 'logs'
 os.makedirs(LOG_DIR, exist_ok=True)
 class RandomOpponentChangeCallback(BaseCallback):
@@ -58,21 +59,36 @@ def main():
     state_stages = [
         "Champion.Level1.RyuVsGuile",
-        "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3
-        "Champion.Level2.ChunLiVsKen",
-        "Champion.Level3.ChunLiVsChunLi",
-        "Champion.Level4.ChunLiVsZangief",
-        "Champion.Level5.ChunLiVsDhalsim",
-        "Champion.Level6.ChunLiVsRyu",
-        "Champion.Level7.ChunLiVsEHonda",
-        "Champion.Level8.ChunLiVsBlanka",
-        "Champion.Level9.ChunLiVsBalrog",
-        "Champion.Level10.ChunLiVsVega",
-        "Champion.Level11.ChunLiVsSagat",
-        "Champion.Level12.ChunLiVsBison"
-        # Add other stages as necessary
+        "Champion.Level2.RyuVsKen",
+        "Champion.Level3.RyuVsChunLi",
+        "Champion.Level4.RyuVsZangief",
+        "Champion.Level5.RyuVsDhalsim",
+        "Champion.Level6.RyuVsRyu",
+        "Champion.Level7.RyuVsEHonda",
+        "Champion.Level8.RyuVsBlanka",
+        "Champion.Level9.RyuVsBalrog",
+        "Champion.Level10.RyuVsVega",
+        "Champion.Level11.RyuVsSagat",
+        "Champion.Level12.RyuVsBison"
     ]
+    # state_stages = [
+    #     "Champion.Level1.RyuVsGuile",
+    #     "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3
+    #     "Champion.Level2.ChunLiVsKen",
+    #     "Champion.Level3.ChunLiVsChunLi",
+    #     "Champion.Level4.ChunLiVsZangief",
+    #     "Champion.Level5.ChunLiVsDhalsim",
+    #     "Champion.Level6.ChunLiVsRyu",
+    #     "Champion.Level7.ChunLiVsEHonda",
+    #     "Champion.Level8.ChunLiVsBlanka",
+    #     "Champion.Level9.ChunLiVsBalrog",
+    #     "Champion.Level10.ChunLiVsVega",
+    #     "Champion.Level11.ChunLiVsSagat",
+    #     "Champion.Level12.ChunLiVsBison"
+    #     # Add other stages as necessary
+    # ]
     # state_stages = [
     #     "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
     #     "ChampionX.Level2.ChunLiVsChunLi",
@@ -103,31 +119,27 @@ def main():
         n_steps=1024,
         batch_size=64,
         learning_rate=1e-4,
-        ent_coef=0.01,
-        clip_range=0.2,
-        gamma=0.95,
-        gae_lambda=0.81322,
-        tensorboard_log="logs/"
+        tensorboard_log="logs"
     )
     # Set the save directory
-    save_dir = "trained_models_ryu_level_1_reward_x3"
+    save_dir = "trained_models_ryu_level_1_time_reward_small_random"
     os.makedirs(save_dir, exist_ok=True)
     # Load the model from file
-    # model_path = "trained_models/ppo_chunli_1296000_steps.zip"
+    model_path = "trained_models_ryu_level_1_time_reward_small_continue/ppo_ryu_400000_steps.zip"
     # Load model and modify the learning rate and entropy coefficient
     # custom_objects = {
     #     "learning_rate": 0.0002
     # }
-    # model = PPO.load(model_path, env=env, device="cuda")#, custom_objects=custom_objects)
+    model = PPO.load(model_path, env=env, device="cuda")#, custom_objects=custom_objects)
     # Set up callbacks
-    # opponent_interval = 35840 # stage_interval * num_envs = total_steps_per_stage
+    opponent_interval = 32768 # stage_interval * num_envs = total_steps_per_stage
     checkpoint_interval = 200000 # checkpoint_interval * num_envs = total_steps_per_checkpoint (Every 80 rounds)
     checkpoint_callback = CheckpointCallback(save_freq=checkpoint_interval, save_path=save_dir, name_prefix="ppo_ryu")
-    # stage_increase_callback = RandomOpponentChangeCallback(state_stages, opponent_interval, save_dir)
+    stage_increase_callback = RandomOpponentChangeCallback(state_stages, opponent_interval, save_dir)
     # model_params = {
     #     'n_steps': 5,
@@ -141,12 +153,21 @@ def main():
     # }
     # model = A2C('CnnPolicy', env, tensorboard_log='logs/', verbose=1, **model_params, policy_kwargs=dict(optimizer_class=RMSpropTF))
-    model.learn(
-        total_timesteps=int(10000000), # total_timesteps = stage_interval * num_envs * num_stages (1120 rounds)
-        callback=[checkpoint_callback]#, stage_increase_callback]
-    )
-    env.close()
+    # Writing the training logs from stdout to a file
+    original_stdout = sys.stdout
+    log_file_path = os.path.join(save_dir, "training_log.txt")
+    with open(log_file_path, 'w') as log_file:
+        sys.stdout = log_file
+        model.learn(
+            total_timesteps=int(10000000), # total_timesteps = stage_interval * num_envs * num_stages (1120 rounds)
+            callback=[checkpoint_callback, stage_increase_callback]
+        )
+        env.close()
+    # Restore stdout
+    sys.stdout = original_stdout
     # Save the final model
     model.save(os.path.join(save_dir, "ppo_sf2_ryu_final.zip"))
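One caveat with the block above: if model.learn() raises, sys.stdout is never restored, because the assignment after the with block is skipped. A sketch of an exception-safe equivalent using contextlib.redirect_stdout, with the same file path and callbacks as above:

import contextlib
import os

# redirect_stdout restores the original stdout on exit, even when learn() raises.
log_file_path = os.path.join(save_dir, "training_log.txt")
with open(log_file_path, 'w') as log_file, contextlib.redirect_stdout(log_file):
    model.learn(
        total_timesteps=int(10000000),
        callback=[checkpoint_callback, stage_increase_callback]
    )
env.close()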

View File

@@ -0,0 +1,68 @@
import os
import retro
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
OPT_DIR = 'optuna/'
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OPT_DIR, exist_ok=True)

def optimize_ppo(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', 512, 2048, log=True),
        'gamma': trial.suggest_float('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_float('learning_rate', 5e-5, 5e-4, log=True),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.9999)
    }

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

def optimize_agent(trial):
    game = "StreetFighterIISpecialChampionEdition-Genesis"
    state = "Champion.Level1.ChunLiVsGuile" # "ChampionX.Level1.ChunLiVsKen"
    try:
        model_params = optimize_ppo(trial)

        # Create environment
        env = make_env(game, state)()
        env = Monitor(env, LOG_DIR)

        # Create algo
        model = PPO('CnnPolicy', env, verbose=1, **model_params)
        model.learn(total_timesteps=500000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=30, deterministic=False)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward
    except Exception:
        return -1

# Creating the experiment
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

print(study.best_params)
print(study.best_trial)
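One practical note: the study above lives only in memory, so a crash partway through the ten half-million-step trials loses everything. Optuna can checkpoint trials to SQLite and resume; a sketch (the sqlite:///optuna/study.db path and sf2_ppo study name are made up for illustration):

import optuna

# Persistent, resumable variant of the in-memory study above.
study = optuna.create_study(
    study_name="sf2_ppo",
    storage="sqlite:///optuna/study.db",
    load_if_exists=True,
    direction='maximize'
)
study.optimize(optimize_agent, n_trials=10, n_jobs=1)
print(study.best_params)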

View File

@@ -0,0 +1,51 @@
import os
import time
import retro
from stable_baselines3.common.monitor import Monitor
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
os.makedirs(LOG_DIR, exist_ok=True)

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

game = "StreetFighterIISpecialChampionEdition-Genesis"
state = "Champion.Level1.RyuVsGuile"

env = make_env(game, state)()
env = Monitor(env, 'logs/')

num_episodes = 30
episode_reward_sum = 0
for _ in range(num_episodes):
    done = False
    obs = env.reset()
    total_reward = 0
    while not done:
        timestamp = time.time()
        obs, reward, done, info = env.step(env.action_space.sample())
        # Note that if player wins but only has 0 HP left, the winning reward is still 0, so it won't be printed.
        if reward != 0:
            total_reward += reward
            print("Reward: {}, playerHP: {}, enemyHP:{}".format(reward, info['health'], info['enemy_health']))
        env.render()
        # time.sleep(0.005)
    print("Total reward: {}".format(total_reward))
    episode_reward_sum += total_reward
env.close()
print("Average reward for random strategy: {}".format(episode_reward_sum/num_episodes))

View File

@@ -0,0 +1,24 @@
import gym
import torch
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

# Custom feature extractor (CNN)
class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.Space):
        super(CustomCNN, self).__init__(observation_space, features_dim=512)
        self.cnn = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=5, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=5, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(16384, self.features_dim),
            nn.ReLU()
        )

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        return self.cnn(observations)
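The hard-coded nn.Linear(16384, ...) only fits one input size: the three convolutions take an 84x84 map to 40x40, then 18x18, then 16x16, and 64 channels * 16 * 16 = 16384. So the extractor implicitly assumes 4-channel 84x84 observations; the file never states this, it is inferred from the layer sizes. A quick shape check under that assumption, using the CustomCNN defined above:

import gym
import numpy as np
import torch

# Build the extractor against the assumed 4x84x84 observation space and push a dummy batch through.
space = gym.spaces.Box(low=0, high=255, shape=(4, 84, 84), dtype=np.uint8)
extractor = CustomCNN(space)
features = extractor(torch.zeros(1, 4, 84, 84))
print(features.shape)  # torch.Size([1, 512])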

View File

@@ -0,0 +1,52 @@
import retro
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from custom_cnn import CustomCNN
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

game = "StreetFighterIISpecialChampionEdition-Genesis"
state_stages = [
    "Champion.Level1.ChunLiVsGuile",
    "Champion.Level2.ChunLiVsKen",
    "Champion.Level3.ChunLiVsChunLi",
    "Champion.Level4.ChunLiVsZangief",
    "Champion.Level5.ChunLiVsDhalsim",
    "Champion.Level6.ChunLiVsRyu",
    "Champion.Level7.ChunLiVsEHonda",
    "Champion.Level8.ChunLiVsBlanka",
    "Champion.Level9.ChunLiVsBalrog",
    "Champion.Level10.ChunLiVsVega",
    "Champion.Level11.ChunLiVsSagat",
    "Champion.Level12.ChunLiVsBison"
    # Add other stages as necessary
]

env = make_env(game, state_stages[0])()
# Wrap the environment
# env = Monitor(env, 'logs/')

policy_kwargs = {'features_extractor_class': CustomCNN}
model = PPO("CnnPolicy", env, policy_kwargs=policy_kwargs)
model = PPO.load(r"dummy_model_ppo_chunli")
# model.load(r"trained_models/ppo_chunli_864000_steps")

# Request the aggregated statistics; with return_episode_rewards=True the call would
# return per-episode lists, which the format string below cannot handle.
mean_reward, std_reward = evaluate_policy(model, env, render=True, n_eval_episodes=10, deterministic=False)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

View File

@@ -0,0 +1,72 @@
#{"t_start": 1680493345.1918354, "env_id": null}
r,l,t
121.963397,1218,4.678451
150.73549,1321,10.670471
-68.875587,1685,16.915831
110.079359,1992,23.723121
-142.422045,2475,30.929999
25.618847,2409,37.817851
-10.399282,2048,44.290086
103.210152,2378,51.321482
-10.83336,1410,56.861604
-109.50968,2320,63.936125
-56.133868,1884,70.48871
-1.263258,3856,83.55072
137.096617,1955,89.782694
-173.590936,2482,96.620559
-32.06399,2622,103.507519
-33.546645,2359,110.155689
247.618832,631,114.254022
34.18776,1988,121.014232
-16.0,2372,128.411761
142.528257,1504,131.040915
-35.511721,2100,141.725583
210.669563,1386,144.090119
106.166972,1756,150.146413
-291.452157,1771,156.139224
-103.271598,2993,163.884984
-3.09159,2881,176.148215
-69.312432,2968,184.263354
11.966888,3420,196.801994
-72.699861,2025,203.564955
-148.405698,1736,209.933663
-6.505259,2863,217.596923
-45.308495,1642,223.866472
-60.133868,1684,230.124691
20.320741,1912,236.920414
12.850028,3201,245.687548
44.331029,2020,252.592099
4.970357,2426,260.241496
-26.074153,3193,272.014878
41.36973,2057,278.669485
-58.986068,1951,285.105779
-6.22689,3090,293.487389
42.518735,2469,304.130861
28.699296,1979,310.846102
-0.123948,2276,318.259012
-130.860871,3166,326.553931
-94.074138,2784,337.55811
-6.779892,1688,340.079623
-2.846787,1842,346.163562
-6.846787,2137,352.507397
74.03875,2905,364.386266
91.493381,1660,370.441151
-60.286049,1502,372.846128
-127.450026,2030,379.306661
-90.028712,1565,385.293074
-45.615818,2020,391.765668
-49.94699,3488,403.627649
-90.632338,2270,410.494766
46.011777,3184,419.074744
-11.461924,2326,429.595886
181.774886,1260,431.709853
-40.06399,2863,442.810548
-90.86073,2232,449.485116
72.549001,2173,455.957089
66.832361,1597,461.523406
29.003218,1923,464.330122
-57.986068,3347,476.034669
175.784026,1320,481.626563
-328.0,1320,483.822258
-81.578734,2842,495.411107
72.161772,1818,501.982363
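The first line of a Monitor file is a JSON header, followed by one row per episode: r (episode reward), l (episode length in steps), and t (wall-clock seconds since start). A sketch of loading it with pandas; the logs/monitor.csv filename is assumed, as Monitor files end in .monitor.csv:

import pandas as pd

# Skip the JSON header line, then read the r/l/t episode columns.
df = pd.read_csv("logs/monitor.csv", skiprows=1)
print("episodes:", len(df))
print("mean reward:", df["r"].mean())
print("mean reward over last 10 episodes:", df["r"].tail(10).mean())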

View File

@@ -0,0 +1,88 @@
import math
import collections
import gym
import numpy as np

# Custom environment wrapper
class StreetFighterCustomWrapper(gym.Wrapper):
    def __init__(self, env, testing=False):
        super(StreetFighterCustomWrapper, self).__init__(env)
        self.env = env

        # Use a deque to store the last [num_frames] frames
        self.num_frames = 3
        self.frame_stack = collections.deque(maxlen=self.num_frames)

        self.reward_coeff = 1.0
        self.total_timesteps = 0

        self.full_hp = 176
        self.prev_player_health = self.full_hp
        self.prev_oppont_health = self.full_hp

        # Update observation space: one color channel from each of the last [num_frames] frames
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(100, 128, 3), dtype=np.uint8)

        self.testing = testing

    def _preprocess_observation(self, observation):
        # Stack the downsampled frames.
        self.frame_stack.append(observation[::2, ::2, :])

        # Stack the R, G, B channel of each frame and return the "image".
        stacked_image = np.stack([frame[:, :, i] for i, frame in enumerate(self.frame_stack)], axis=-1)
        return stacked_image

    def reset(self):
        observation = self.env.reset()
        self.prev_player_health = self.full_hp
        self.prev_oppont_health = self.full_hp
        self.total_timesteps = 0

        # Clear the frame stack and add the first observation [num_frames] times
        self.frame_stack.clear()
        for _ in range(self.num_frames):
            self.frame_stack.append(observation[::2, ::2, :])

        return np.stack([frame[:, :, i] for i, frame in enumerate(self.frame_stack)], axis=-1)

    def step(self, action):
        obs, _reward, _done, info = self.env.step(action)
        curr_player_health = info['health']
        curr_oppont_health = info['enemy_health']
        self.total_timesteps += 1

        # Game is over and player loses.
        if curr_player_health < 0:
            custom_reward = -math.pow(self.full_hp, (curr_oppont_health + 1) / (self.full_hp + 1)) # Use the remaining health points of opponent as penalty.
            # If the opponent also has negative health points, it's an even game and the penalty is minimal (about -1).
            custom_done = True
        # Game is over and player wins.
        elif curr_oppont_health < 0:
            # custom_reward = curr_player_health * self.reward_coeff # Use the remaining health points of player as reward.
            # Multiply by reward_coeff to make the reward larger than the penalty to avoid cowardice of agent.
            custom_reward = math.pow(self.full_hp, (5940 - self.total_timesteps) / 5940)
            custom_done = True
        # While the fighting is still going on
        else:
            custom_reward = self.reward_coeff * (self.prev_oppont_health - curr_oppont_health) - (self.prev_player_health - curr_player_health)
            self.prev_player_health = curr_player_health
            self.prev_oppont_health = curr_oppont_health
            custom_done = False

        # During testing, the session should always keep going.
        if self.testing:
            custom_done = False

        # Max reward is 6 * full_hp = 1056 (damage * 3 + winning_reward * 3)
        return self._preprocess_observation(obs), custom_reward, custom_done, info
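The preprocessing above is a compact motion encoding: each output channel is one color channel taken from a different frame in the deque (R from the oldest, G from the middle, B from the newest), so a stock 3-channel CNN policy sees three timesteps at once. A toy shape check; the 200x256 raw frame size is inferred from the [::2, ::2] downsampling and the declared (100, 128, 3) observation space:

import collections
import numpy as np

frames = collections.deque(maxlen=3)
for _ in range(3):
    raw = np.random.randint(0, 256, (200, 256, 3), dtype=np.uint8)  # stand-in for a raw Genesis frame
    frames.append(raw[::2, ::2, :])                                 # downsample to (100, 128, 3)

# Channel i of the output comes from channel i of frame i.
stacked = np.stack([frame[:, :, i] for i, frame in enumerate(frames)], axis=-1)
print(stacked.shape)  # (100, 128, 3)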

View File

@@ -0,0 +1,76 @@
import time
import retro
from stable_baselines3 import PPO
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

game = "StreetFighterIISpecialChampionEdition-Genesis"
state_stages = [
    "Champion.Level1.RyuVsGuile",
    "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3 | -20.4
    "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
    "Champion.Level2.ChunLiVsKen",
    "Champion.Level3.ChunLiVsChunLi",
    "Champion.Level4.ChunLiVsZangief",
    "Champion.Level5.ChunLiVsDhalsim",
    "Champion.Level6.ChunLiVsRyu",
    "Champion.Level7.ChunLiVsEHonda",
    "Champion.Level8.ChunLiVsBlanka",
    "Champion.Level9.ChunLiVsBalrog",
    "Champion.Level10.ChunLiVsVega",
    "Champion.Level11.ChunLiVsSagat",
    "Champion.Level12.ChunLiVsBison"
    # Add other stages as necessary
]

env = make_env(game, state_stages[0])()

model = PPO(
    "CnnPolicy",
    env,
    verbose=1
)
model_path = r"trained_models_ryu_level_1_reward_x3/ppo_ryu_6600000_steps"
model.load(model_path)

# Average reward for optuna/trial_1_best_model: -82.3
# Average reward for optuna/trial_9_best_model: 36.7 | -86.23
# Average reward for trained_models/ppo_chunli_5376000_steps: -77.8

obs = env.reset()
done = False

num_episodes = 30
episode_reward_sum = 0
for _ in range(num_episodes):
    done = False
    obs = env.reset()
    total_reward = 0
    while True:
        timestamp = time.time()
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        if reward != 0:
            total_reward += reward
            print("Reward: {}, playerHP: {}, enemyHP:{}".format(reward, info['health'], info['enemy_health']))
        env.render()
        # time.sleep(0.005)
    # print("Total reward: {}".format(total_reward))
    # episode_reward_sum += total_reward

# env.close()
# print("Average reward for {}: {}".format(model_path, episode_reward_sum/num_episodes))

View File

@@ -0,0 +1,151 @@
import os
import random
import retro
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs'
os.makedirs(LOG_DIR, exist_ok=True)

class RandomOpponentChangeCallback(BaseCallback):
    def __init__(self, stages, opponent_interval, verbose=0):
        super(RandomOpponentChangeCallback, self).__init__(verbose)
        self.stages = stages
        self.opponent_interval = opponent_interval

    def _on_step(self) -> bool:
        if self.n_calls % self.opponent_interval == 0:
            new_state = random.choice(self.stages)
            print("\nCurrent state:", new_state)
            self.training_env.env_method("load_state", new_state, indices=None)
        return True

# class StageIncreaseCallback(BaseCallback):
#     def __init__(self, stages, stage_interval, save_dir, verbose=0):
#         super(StageIncreaseCallback, self).__init__(verbose)
#         self.stages = stages
#         self.stage_interval = stage_interval
#         self.save_dir = save_dir
#         self.current_stage = 0
#
#     def _on_step(self) -> bool:
#         if self.n_calls % self.stage_interval == 0 and self.current_stage < len(self.stages) - 1:
#             self.current_stage += 1
#             new_state = self.stages[self.current_stage]
#             self.training_env.env_method("load_state", new_state, indices=None)
#             self.model.save(os.path.join(self.save_dir, f"ppo_chunli_stage_{self.current_stage}.zip"))
#         return True

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

def main():
    # Set up the environment and model
    game = "StreetFighterIISpecialChampionEdition-Genesis"
    state_stages = [
        "Champion.Level1.RyuVsGuile",
        "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3
        "Champion.Level2.ChunLiVsKen",
        "Champion.Level3.ChunLiVsChunLi",
        "Champion.Level4.ChunLiVsZangief",
        "Champion.Level5.ChunLiVsDhalsim",
        "Champion.Level6.ChunLiVsRyu",
        "Champion.Level7.ChunLiVsEHonda",
        "Champion.Level8.ChunLiVsBlanka",
        "Champion.Level9.ChunLiVsBalrog",
        "Champion.Level10.ChunLiVsVega",
        "Champion.Level11.ChunLiVsSagat",
        "Champion.Level12.ChunLiVsBison"
        # Add other stages as necessary
    ]
    # state_stages = [
    #     "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
    #     "ChampionX.Level2.ChunLiVsChunLi",
    #     "ChampionX.Level3.ChunLiVsZangief",
    #     "ChampionX.Level4.ChunLiVsDhalsim",
    #     "ChampionX.Level5.ChunLiVsRyu",
    #     "ChampionX.Level6.ChunLiVsEHonda",
    #     "ChampionX.Level7.ChunLiVsBlanka",
    #     "ChampionX.Level8.ChunLiVsGuile",
    #     "ChampionX.Level9.ChunLiVsBalrog",
    #     "ChampionX.Level10.ChunLiVsVega",
    #     "ChampionX.Level11.ChunLiVsSagat",
    #     "ChampionX.Level12.ChunLiVsBison"
    #     # Add other stages as necessary
    # ]
    # Champion is at difficulty level 4, ChampionX is at difficulty level 8.

    env = make_env(game, state_stages[0])()
    # Wrap env in Monitor wrapper to record training progress
    env = Monitor(env, LOG_DIR)

    model = PPO(
        "CnnPolicy",
        env,
        device="cuda",
        verbose=1,
        n_steps=2048,
        batch_size=64,
        learning_rate=1e-4,
        gamma=0.99,
        tensorboard_log="logs"
    )

    # Set the save directory
    save_dir = "trained_models_ryu_level_1_time_reward"
    os.makedirs(save_dir, exist_ok=True)

    # Load the model from file
    # model_path = "trained_models/ppo_chunli_1296000_steps.zip"
    # Load model and modify the learning rate and entropy coefficient
    # custom_objects = {
    #     "learning_rate": 0.0002
    # }
    # model = PPO.load(model_path, env=env, device="cuda")#, custom_objects=custom_objects)

    # Set up callbacks
    # opponent_interval = 35840 # stage_interval * num_envs = total_steps_per_stage
    checkpoint_interval = 200000 # checkpoint_interval * num_envs = total_steps_per_checkpoint (Every 80 rounds)
    checkpoint_callback = CheckpointCallback(save_freq=checkpoint_interval, save_path=save_dir, name_prefix="ppo_ryu")
    # stage_increase_callback = RandomOpponentChangeCallback(state_stages, opponent_interval, save_dir)

    # model_params = {
    #     'n_steps': 5,
    #     'gamma': 0.99,
    #     'gae_lambda': 1,
    #     'learning_rate': 7e-4,
    #     'vf_coef': 0.5,
    #     'ent_coef': 0.0,
    #     'max_grad_norm': 0.5,
    #     'rms_prop_eps': 1e-05
    # }
    # model = A2C('CnnPolicy', env, tensorboard_log='logs/', verbose=1, **model_params, policy_kwargs=dict(optimizer_class=RMSpropTF))

    model.learn(
        total_timesteps=int(10000000), # total_timesteps = stage_interval * num_envs * num_stages (1120 rounds)
        callback=[checkpoint_callback]#, stage_increase_callback]
    )
    env.close()

    # Save the final model
    model.save(os.path.join(save_dir, "ppo_sf2_ryu_final.zip"))

if __name__ == "__main__":
    main()
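A note on RandomOpponentChangeCallback: self.training_env.env_method("load_state", ...) works because Stable-Baselines3 wraps the single environment in a DummyVecEnv, and the call is forwarded through the gym wrapper to retro's RetroEnv.load_state, so the emulator jumps to the chosen save state mid-training. A standalone sketch of the same mechanism, using make_env and state names from the file above:

from stable_baselines3.common.vec_env import DummyVecEnv

game = "StreetFighterIISpecialChampionEdition-Genesis"

# Vectorize a single env, then hot-swap the opponent by loading a different save state.
venv = DummyVecEnv([make_env(game, "Champion.Level1.ChunLiVsGuile")])
venv.env_method("load_state", "Champion.Level5.ChunLiVsDhalsim", indices=None)
obs = venv.reset()  # the emulator now runs from the newly loaded state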

View File

@@ -0,0 +1,68 @@
import os
import retro
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
OPT_DIR = 'optuna/'
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OPT_DIR, exist_ok=True)

def optimize_ppo(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', 512, 2048, log=True),
        'gamma': trial.suggest_float('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_float('learning_rate', 5e-5, 5e-4, log=True),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.9999)
    }

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

def optimize_agent(trial):
    game = "StreetFighterIISpecialChampionEdition-Genesis"
    state = "Champion.Level1.ChunLiVsGuile" # "ChampionX.Level1.ChunLiVsKen"
    try:
        model_params = optimize_ppo(trial)

        # Create environment
        env = make_env(game, state)()
        env = Monitor(env, LOG_DIR)

        # Create algo
        model = PPO('CnnPolicy', env, verbose=1, **model_params)
        model.learn(total_timesteps=500000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=30, deterministic=False)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward
    except Exception:
        return -1

# Creating the experiment
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

print(study.best_params)
print(study.best_trial)