hyperparameter

linyiLYi 2023-04-03 20:50:10 +08:00
parent ded261ba69
commit 570d5bbe5c
30 changed files with 2798 additions and 255 deletions

View File

@@ -1,3 +1,4 @@
+import os
 import time
 import retro
@@ -5,6 +6,9 @@ from stable_baselines3.common.monitor import Monitor
 from street_fighter_custom_wrapper import StreetFighterCustomWrapper
+LOG_DIR = 'logs/'
+os.makedirs(LOG_DIR, exist_ok=True)
 def make_env(game, state):
     def _init():
         env = retro.make(

File diff suppressed because it is too large.

View File

@@ -1,3 +1,4 @@
+import math
 import collections
 import gym
@@ -13,7 +14,9 @@ class StreetFighterCustomWrapper(gym.Wrapper):
         self.num_frames = 3
         self.frame_stack = collections.deque(maxlen=self.num_frames)
-        self.reward_coeff = 3
+        self.reward_coeff = 3.0
+        self.total_timesteps = 0
         self.full_hp = 176
         self.prev_player_health = self.full_hp
@@ -37,6 +40,8 @@ class StreetFighterCustomWrapper(gym.Wrapper):
         observation = self.env.reset()
         self.prev_player_health = self.full_hp
         self.prev_oppont_health = self.full_hp
+        self.total_timesteps = 0
         # Clear the frame stack and add the first observation [num_frames] times
         self.frame_stack.clear()
@@ -50,20 +55,24 @@ class StreetFighterCustomWrapper(gym.Wrapper):
         obs, _reward, _done, info = self.env.step(action)
         curr_player_health = info['health']
         curr_oppont_health = info['enemy_health']
+        self.total_timesteps += 1
         # Game is over and player loses.
         if curr_player_health < 0:
-            custom_reward = -curr_oppont_health # Use the remaining health points of opponent as penalty.
+            custom_reward = -math.pow(self.full_hp, (curr_oppont_health + 1) / (self.full_hp + 1)) # Use the remaining health points of opponent as penalty.
             # If the opponent also has negative health points, it's an even game and the penalty is minimal (about -1).
             custom_done = True
         # Game is over and player wins.
         elif curr_oppont_health < 0:
-            custom_reward = curr_player_health * self.reward_coeff # Use the remaining health points of player as reward.
+            # custom_reward = curr_player_health * self.reward_coeff # Use the remaining health points of player as reward.
             # Multiply by reward_coeff to make the reward larger than the penalty to avoid cowardice of agent.
+            custom_reward = math.pow(self.full_hp, (5940 - self.total_timesteps) / 5940) * self.reward_coeff # Use the remaining time steps as reward.
             custom_done = True
         # While the fighting is still going on
         else:
             custom_reward = self.reward_coeff * (self.prev_oppont_health - curr_oppont_health) - (self.prev_player_health - curr_player_health)
             self.prev_player_health = curr_player_health
@@ -75,5 +84,5 @@ class StreetFighterCustomWrapper(gym.Wrapper):
             custom_done = False
         # Max reward is 6 * full_hp = 1056 (damage * 3 + winning_reward * 3)
-        return self._preprocess_observation(obs), custom_reward, custom_done, info
+        return self._preprocess_observation(obs), 0.001 * custom_reward, custom_done, info # reward normalization
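The new shaping replaces the linear HP-based terminal rewards with exponential ones: a loss costs up to full_hp (scaled by how much health the opponent keeps), a win pays full_hp decaying toward 1 as the step budget runs out, and everything is scaled by 0.001 at the end. A minimal sketch of the resulting magnitudes, assuming 5940 is the per-round step budget the constant refers to:

import math

FULL_HP = 176
MAX_STEPS = 5940  # assumed per-round step budget, matching the constant in the diff

def win_reward(t, coeff=3.0):
    # Decays from coeff * FULL_HP at t=0 down to coeff * 1 at t=MAX_STEPS.
    return coeff * math.pow(FULL_HP, (MAX_STEPS - t) / MAX_STEPS)

def loss_penalty(enemy_hp):
    # Grows from about -1 (enemy at -1 HP, a near-draw) to -FULL_HP (enemy untouched).
    return -math.pow(FULL_HP, (enemy_hp + 1) / (FULL_HP + 1))

for t in (1000, 3000, 5940):
    print(f"win at step {t}: {0.001 * win_reward(t):+.3f}")   # normalized as in step()
for hp in (176, 88, 0):
    print(f"loss, enemy HP {hp}: {0.001 * loss_penalty(hp):+.3f}")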

View File

@@ -20,30 +20,44 @@ def make_env(game, state):
 game = "StreetFighterIISpecialChampionEdition-Genesis"
 state_stages = [
     "Champion.Level1.RyuVsGuile",
-    "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3 | -20.4
-    "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
-    "Champion.Level2.ChunLiVsKen",
-    "Champion.Level3.ChunLiVsChunLi",
-    "Champion.Level4.ChunLiVsZangief",
-    "Champion.Level5.ChunLiVsDhalsim",
-    "Champion.Level6.ChunLiVsRyu",
-    "Champion.Level7.ChunLiVsEHonda",
-    "Champion.Level8.ChunLiVsBlanka",
-    "Champion.Level9.ChunLiVsBalrog",
-    "Champion.Level10.ChunLiVsVega",
-    "Champion.Level11.ChunLiVsSagat",
-    "Champion.Level12.ChunLiVsBison"
-    # Add other stages as necessary
+    "Champion.Level2.RyuVsKen",
+    "Champion.Level3.RyuVsChunLi",
+    "Champion.Level4.RyuVsZangief",
+    "Champion.Level5.RyuVsDhalsim",
+    "Champion.Level6.RyuVsRyu",
+    "Champion.Level7.RyuVsEHonda",
+    "Champion.Level8.RyuVsBlanka",
+    "Champion.Level9.RyuVsBalrog",
+    "Champion.Level10.RyuVsVega",
+    "Champion.Level11.RyuVsSagat",
+    "Champion.Level12.RyuVsBison"
 ]
+# state_stages = [
+#     "Champion.Level1.RyuVsGuile",
+#     "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3 | -20.4
+#     "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
+#     "Champion.Level2.ChunLiVsKen",
+#     "Champion.Level3.ChunLiVsChunLi",
+#     "Champion.Level4.ChunLiVsZangief",
+#     "Champion.Level5.ChunLiVsDhalsim",
+#     "Champion.Level6.ChunLiVsRyu",
+#     "Champion.Level7.ChunLiVsEHonda",
+#     "Champion.Level8.ChunLiVsBlanka",
+#     "Champion.Level9.ChunLiVsBalrog",
+#     "Champion.Level10.ChunLiVsVega",
+#     "Champion.Level11.ChunLiVsSagat",
+#     "Champion.Level12.ChunLiVsBison"
+#     # Add other stages as necessary
+# ]
-env = make_env(game, state_stages[0])()
+env = make_env(game, state_stages[11])()
 model = PPO(
     "CnnPolicy",
     env,
     verbose=1
 )
-model_path = r"trained_models_level_1/ppo_ryu_000000_steps"
+model_path = r"trained_models_ryu_level_1_time_reward_small_random/ppo_ryu_2600000_steps"
 model.load(model_path)
 # Average reward for optuna/trial_1_best_model: -82.3
 # Average reward for optuna/trial_9_best_model: 36.7 | -86.23
@@ -60,6 +74,7 @@ for _ in range(num_episodes):
     obs = env.reset()
     total_reward = 0
     while not done:
+    # while True:
         timestamp = time.time()
         action, _states = model.predict(obs)
         obs, reward, done, info = env.step(action)
@@ -68,9 +83,9 @@ for _ in range(num_episodes):
             total_reward += reward
             print("Reward: {}, playerHP: {}, enemyHP:{}".format(reward, info['health'], info['enemy_health']))
         env.render()
-        time.sleep(0.01)
+        # time.sleep(0.005)
     print("Total reward: {}".format(total_reward))
     episode_reward_sum += total_reward
-env.close()
-print("Average reward for {}: {}".format(model_path, episode_reward_sum/num_episodes))
+# env.close()
+# print("Average reward for {}: {}".format(model_path, episode_reward_sum/num_episodes))

View File

@@ -1,4 +1,5 @@
 import os
+import sys
 import random
 import retro
@@ -8,7 +9,7 @@ from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback
 from street_fighter_custom_wrapper import StreetFighterCustomWrapper
-LOG_DIR = 'logs/'
+LOG_DIR = 'logs'
 os.makedirs(LOG_DIR, exist_ok=True)
 class RandomOpponentChangeCallback(BaseCallback):
@@ -58,21 +59,36 @@ def main():
     state_stages = [
         "Champion.Level1.RyuVsGuile",
-        "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3
-        "Champion.Level2.ChunLiVsKen",
-        "Champion.Level3.ChunLiVsChunLi",
-        "Champion.Level4.ChunLiVsZangief",
-        "Champion.Level5.ChunLiVsDhalsim",
-        "Champion.Level6.ChunLiVsRyu",
-        "Champion.Level7.ChunLiVsEHonda",
-        "Champion.Level8.ChunLiVsBlanka",
-        "Champion.Level9.ChunLiVsBalrog",
-        "Champion.Level10.ChunLiVsVega",
-        "Champion.Level11.ChunLiVsSagat",
-        "Champion.Level12.ChunLiVsBison"
-        # Add other stages as necessary
+        "Champion.Level2.RyuVsKen",
+        "Champion.Level3.RyuVsChunLi",
+        "Champion.Level4.RyuVsZangief",
+        "Champion.Level5.RyuVsDhalsim",
+        "Champion.Level6.RyuVsRyu",
+        "Champion.Level7.RyuVsEHonda",
+        "Champion.Level8.RyuVsBlanka",
+        "Champion.Level9.RyuVsBalrog",
+        "Champion.Level10.RyuVsVega",
+        "Champion.Level11.RyuVsSagat",
+        "Champion.Level12.RyuVsBison"
     ]
+    # state_stages = [
+    #     "Champion.Level1.RyuVsGuile",
+    #     "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3
+    #     "Champion.Level2.ChunLiVsKen",
+    #     "Champion.Level3.ChunLiVsChunLi",
+    #     "Champion.Level4.ChunLiVsZangief",
+    #     "Champion.Level5.ChunLiVsDhalsim",
+    #     "Champion.Level6.ChunLiVsRyu",
+    #     "Champion.Level7.ChunLiVsEHonda",
+    #     "Champion.Level8.ChunLiVsBlanka",
+    #     "Champion.Level9.ChunLiVsBalrog",
+    #     "Champion.Level10.ChunLiVsVega",
+    #     "Champion.Level11.ChunLiVsSagat",
+    #     "Champion.Level12.ChunLiVsBison"
+    #     # Add other stages as necessary
+    # ]
     # state_stages = [
     #     "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
     #     "ChampionX.Level2.ChunLiVsChunLi",
@@ -103,31 +119,27 @@ def main():
         n_steps=1024,
         batch_size=64,
         learning_rate=1e-4,
-        ent_coef=0.01,
-        clip_range=0.2,
-        gamma=0.95,
-        gae_lambda=0.81322,
-        tensorboard_log="logs/"
+        tensorboard_log="logs"
     )
     # Set the save directory
-    save_dir = "trained_models_ryu_level_1_reward_x3"
+    save_dir = "trained_models_ryu_level_1_time_reward_small_random"
     os.makedirs(save_dir, exist_ok=True)
     # Load the model from file
-    # model_path = "trained_models/ppo_chunli_1296000_steps.zip"
+    model_path = "trained_models_ryu_level_1_time_reward_small_continue/ppo_ryu_400000_steps.zip"
     # Load model and modify the learning rate and entropy coefficient
     # custom_objects = {
     #     "learning_rate": 0.0002
     # }
-    # model = PPO.load(model_path, env=env, device="cuda")#, custom_objects=custom_objects)
+    model = PPO.load(model_path, env=env, device="cuda")#, custom_objects=custom_objects)
     # Set up callbacks
-    # opponent_interval = 35840 # stage_interval * num_envs = total_steps_per_stage
+    opponent_interval = 32768 # stage_interval * num_envs = total_steps_per_stage
     checkpoint_interval = 200000 # checkpoint_interval * num_envs = total_steps_per_checkpoint (Every 80 rounds)
     checkpoint_callback = CheckpointCallback(save_freq=checkpoint_interval, save_path=save_dir, name_prefix="ppo_ryu")
-    # stage_increase_callback = RandomOpponentChangeCallback(state_stages, opponent_interval, save_dir)
+    stage_increase_callback = RandomOpponentChangeCallback(state_stages, opponent_interval, save_dir)
     # model_params = {
     #     'n_steps': 5,
@@ -141,12 +153,21 @@ def main():
     # }
     # model = A2C('CnnPolicy', env, tensorboard_log='logs/', verbose=1, **model_params, policy_kwargs=dict(optimizer_class=RMSpropTF))
-    model.learn(
-        total_timesteps=int(10000000), # total_timesteps = stage_interval * num_envs * num_stages (1120 rounds)
-        callback=[checkpoint_callback]#, stage_increase_callback]
-    )
-    env.close()
+    # Writing the training logs from stdout to a file
+    original_stdout = sys.stdout
+    log_file_path = os.path.join(save_dir, "training_log.txt")
+    with open(log_file_path, 'w') as log_file:
+        sys.stdout = log_file
+        model.learn(
+            total_timesteps=int(10000000), # total_timesteps = stage_interval * num_envs * num_stages (1120 rounds)
+            callback=[checkpoint_callback, stage_increase_callback]
+        )
+        env.close()
+    # Restore stdout
+    sys.stdout = original_stdout
     # Save the final model
     model.save(os.path.join(save_dir, "ppo_sf2_ryu_final.zip"))
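One caveat with the block above: if model.learn() raises, sys.stdout is never restored, because the assignment after the with block is skipped. A sketch of an exception-safe equivalent using contextlib.redirect_stdout, with the same file path and callbacks as above:

import contextlib
import os

# redirect_stdout restores the original stdout on exit, even when learn() raises.
log_file_path = os.path.join(save_dir, "training_log.txt")
with open(log_file_path, 'w') as log_file, contextlib.redirect_stdout(log_file):
    model.learn(
        total_timesteps=int(10000000),
        callback=[checkpoint_callback, stage_increase_callback]
    )
env.close()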

View File

@@ -0,0 +1,68 @@
import os
import retro
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
OPT_DIR = 'optuna/'
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OPT_DIR, exist_ok=True)

def optimize_ppo(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', 512, 2048, log=True),
        'gamma': trial.suggest_float('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_float('learning_rate', 5e-5, 5e-4, log=True),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.9999)
    }

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

def optimize_agent(trial):
    game = "StreetFighterIISpecialChampionEdition-Genesis"
    state = "Champion.Level1.ChunLiVsGuile" # "ChampionX.Level1.ChunLiVsKen"
    try:
        model_params = optimize_ppo(trial)

        # Create environment
        env = make_env(game, state)()
        env = Monitor(env, LOG_DIR)

        # Create algo
        model = PPO('CnnPolicy', env, verbose=1, **model_params)
        model.learn(total_timesteps=500000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=30, deterministic=False)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward
    except Exception:
        return -1

# Creating the experiment
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

print(study.best_params)
print(study.best_trial)
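One practical note: the study above lives only in memory, so a crash partway through the ten half-million-step trials loses everything. Optuna can checkpoint trials to SQLite and resume; a sketch (the sqlite:///optuna/study.db path and sf2_ppo study name are made up for illustration):

import optuna

# Persistent, resumable variant of the in-memory study above.
study = optuna.create_study(
    study_name="sf2_ppo",
    storage="sqlite:///optuna/study.db",
    load_if_exists=True,
    direction='maximize'
)
study.optimize(optimize_agent, n_trials=10, n_jobs=1)
print(study.best_params)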

View File

@@ -0,0 +1,51 @@
import os
import time
import retro
from stable_baselines3.common.monitor import Monitor
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
os.makedirs(LOG_DIR, exist_ok=True)

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

game = "StreetFighterIISpecialChampionEdition-Genesis"
state = "Champion.Level1.RyuVsGuile"

env = make_env(game, state)()
env = Monitor(env, 'logs/')

num_episodes = 30
episode_reward_sum = 0
for _ in range(num_episodes):
    done = False
    obs = env.reset()
    total_reward = 0
    while not done:
        timestamp = time.time()
        obs, reward, done, info = env.step(env.action_space.sample())
        # Note that if player wins but only has 0 HP left, the winning reward is still 0, so it won't be printed.
        if reward != 0:
            total_reward += reward
            print("Reward: {}, playerHP: {}, enemyHP:{}".format(reward, info['health'], info['enemy_health']))
        env.render()
        # time.sleep(0.005)
    print("Total reward: {}".format(total_reward))
    episode_reward_sum += total_reward
env.close()
print("Average reward for random strategy: {}".format(episode_reward_sum/num_episodes))

View File

@@ -0,0 +1,24 @@
import gym
import torch
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

# Custom feature extractor (CNN)
class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.Space):
        super(CustomCNN, self).__init__(observation_space, features_dim=512)
        self.cnn = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=5, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=5, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(16384, self.features_dim),
            nn.ReLU()
        )

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        return self.cnn(observations)
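The hard-coded nn.Linear(16384, ...) only fits one input size: the three convolutions take an 84x84 map to 40x40, then 18x18, then 16x16, and 64 channels * 16 * 16 = 16384. So the extractor implicitly assumes 4-channel 84x84 observations; the file never states this, it is inferred from the layer sizes. A quick shape check under that assumption, using the CustomCNN defined above:

import gym
import numpy as np
import torch

# Build the extractor against the assumed 4x84x84 observation space and push a dummy batch through.
space = gym.spaces.Box(low=0, high=255, shape=(4, 84, 84), dtype=np.uint8)
extractor = CustomCNN(space)
features = extractor(torch.zeros(1, 4, 84, 84))
print(features.shape)  # torch.Size([1, 512])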

View File

@@ -0,0 +1,52 @@
import retro
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from custom_cnn import CustomCNN
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

game = "StreetFighterIISpecialChampionEdition-Genesis"
state_stages = [
    "Champion.Level1.ChunLiVsGuile",
    "Champion.Level2.ChunLiVsKen",
    "Champion.Level3.ChunLiVsChunLi",
    "Champion.Level4.ChunLiVsZangief",
    "Champion.Level5.ChunLiVsDhalsim",
    "Champion.Level6.ChunLiVsRyu",
    "Champion.Level7.ChunLiVsEHonda",
    "Champion.Level8.ChunLiVsBlanka",
    "Champion.Level9.ChunLiVsBalrog",
    "Champion.Level10.ChunLiVsVega",
    "Champion.Level11.ChunLiVsSagat",
    "Champion.Level12.ChunLiVsBison"
    # Add other stages as necessary
]

env = make_env(game, state_stages[0])()
# Wrap the environment
# env = Monitor(env, 'logs/')

policy_kwargs = {'features_extractor_class': CustomCNN}
model = PPO("CnnPolicy", env, policy_kwargs=policy_kwargs)
model = PPO.load(r"dummy_model_ppo_chunli")
# model.load(r"trained_models/ppo_chunli_864000_steps")

# Request the aggregated statistics; with return_episode_rewards=True the call would
# return per-episode lists, which the format string below cannot handle.
mean_reward, std_reward = evaluate_policy(model, env, render=True, n_eval_episodes=10, deterministic=False)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

View File

@@ -0,0 +1,72 @@
#{"t_start": 1680493345.1918354, "env_id": null}
r,l,t
121.963397,1218,4.678451
150.73549,1321,10.670471
-68.875587,1685,16.915831
110.079359,1992,23.723121
-142.422045,2475,30.929999
25.618847,2409,37.817851
-10.399282,2048,44.290086
103.210152,2378,51.321482
-10.83336,1410,56.861604
-109.50968,2320,63.936125
-56.133868,1884,70.48871
-1.263258,3856,83.55072
137.096617,1955,89.782694
-173.590936,2482,96.620559
-32.06399,2622,103.507519
-33.546645,2359,110.155689
247.618832,631,114.254022
34.18776,1988,121.014232
-16.0,2372,128.411761
142.528257,1504,131.040915
-35.511721,2100,141.725583
210.669563,1386,144.090119
106.166972,1756,150.146413
-291.452157,1771,156.139224
-103.271598,2993,163.884984
-3.09159,2881,176.148215
-69.312432,2968,184.263354
11.966888,3420,196.801994
-72.699861,2025,203.564955
-148.405698,1736,209.933663
-6.505259,2863,217.596923
-45.308495,1642,223.866472
-60.133868,1684,230.124691
20.320741,1912,236.920414
12.850028,3201,245.687548
44.331029,2020,252.592099
4.970357,2426,260.241496
-26.074153,3193,272.014878
41.36973,2057,278.669485
-58.986068,1951,285.105779
-6.22689,3090,293.487389
42.518735,2469,304.130861
28.699296,1979,310.846102
-0.123948,2276,318.259012
-130.860871,3166,326.553931
-94.074138,2784,337.55811
-6.779892,1688,340.079623
-2.846787,1842,346.163562
-6.846787,2137,352.507397
74.03875,2905,364.386266
91.493381,1660,370.441151
-60.286049,1502,372.846128
-127.450026,2030,379.306661
-90.028712,1565,385.293074
-45.615818,2020,391.765668
-49.94699,3488,403.627649
-90.632338,2270,410.494766
46.011777,3184,419.074744
-11.461924,2326,429.595886
181.774886,1260,431.709853
-40.06399,2863,442.810548
-90.86073,2232,449.485116
72.549001,2173,455.957089
66.832361,1597,461.523406
29.003218,1923,464.330122
-57.986068,3347,476.034669
175.784026,1320,481.626563
-328.0,1320,483.822258
-81.578734,2842,495.411107
72.161772,1818,501.982363
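The first line of a Monitor file is a JSON header, followed by one row per episode: r (episode reward), l (episode length in steps), and t (wall-clock seconds since start). A sketch of loading it with pandas; the logs/monitor.csv filename is assumed, as Monitor files end in .monitor.csv:

import pandas as pd

# Skip the JSON header line, then read the r/l/t episode columns.
df = pd.read_csv("logs/monitor.csv", skiprows=1)
print("episodes:", len(df))
print("mean reward:", df["r"].mean())
print("mean reward over last 10 episodes:", df["r"].tail(10).mean())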

View File

@@ -0,0 +1,88 @@
import math
import collections
import gym
import numpy as np

# Custom environment wrapper
class StreetFighterCustomWrapper(gym.Wrapper):
    def __init__(self, env, testing=False):
        super(StreetFighterCustomWrapper, self).__init__(env)
        self.env = env

        # Use a deque to store the last [num_frames] frames
        self.num_frames = 3
        self.frame_stack = collections.deque(maxlen=self.num_frames)

        self.reward_coeff = 1.0
        self.total_timesteps = 0

        self.full_hp = 176
        self.prev_player_health = self.full_hp
        self.prev_oppont_health = self.full_hp

        # Update observation space: one color channel from each of the last [num_frames] frames
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(100, 128, 3), dtype=np.uint8)

        self.testing = testing

    def _preprocess_observation(self, observation):
        # Stack the downsampled frames.
        self.frame_stack.append(observation[::2, ::2, :])

        # Stack the R, G, B channel of each frame and return the "image".
        stacked_image = np.stack([frame[:, :, i] for i, frame in enumerate(self.frame_stack)], axis=-1)
        return stacked_image

    def reset(self):
        observation = self.env.reset()
        self.prev_player_health = self.full_hp
        self.prev_oppont_health = self.full_hp
        self.total_timesteps = 0

        # Clear the frame stack and add the first observation [num_frames] times
        self.frame_stack.clear()
        for _ in range(self.num_frames):
            self.frame_stack.append(observation[::2, ::2, :])

        return np.stack([frame[:, :, i] for i, frame in enumerate(self.frame_stack)], axis=-1)

    def step(self, action):
        obs, _reward, _done, info = self.env.step(action)
        curr_player_health = info['health']
        curr_oppont_health = info['enemy_health']
        self.total_timesteps += 1

        # Game is over and player loses.
        if curr_player_health < 0:
            custom_reward = -math.pow(self.full_hp, (curr_oppont_health + 1) / (self.full_hp + 1)) # Use the remaining health points of opponent as penalty.
            # If the opponent also has negative health points, it's an even game and the penalty is minimal (about -1).
            custom_done = True
        # Game is over and player wins.
        elif curr_oppont_health < 0:
            # custom_reward = curr_player_health * self.reward_coeff # Use the remaining health points of player as reward.
            # Multiply by reward_coeff to make the reward larger than the penalty to avoid cowardice of agent.
            custom_reward = math.pow(self.full_hp, (5940 - self.total_timesteps) / 5940)
            custom_done = True
        # While the fighting is still going on
        else:
            custom_reward = self.reward_coeff * (self.prev_oppont_health - curr_oppont_health) - (self.prev_player_health - curr_player_health)
            self.prev_player_health = curr_player_health
            self.prev_oppont_health = curr_oppont_health
            custom_done = False

        # During testing, the session should always keep going.
        if self.testing:
            custom_done = False

        # Max reward is 6 * full_hp = 1056 (damage * 3 + winning_reward * 3)
        return self._preprocess_observation(obs), custom_reward, custom_done, info
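The preprocessing above is a compact motion encoding: each output channel is one color channel taken from a different frame in the deque (R from the oldest, G from the middle, B from the newest), so a stock 3-channel CNN policy sees three timesteps at once. A toy shape check; the 200x256 raw frame size is inferred from the [::2, ::2] downsampling and the declared (100, 128, 3) observation space:

import collections
import numpy as np

frames = collections.deque(maxlen=3)
for _ in range(3):
    raw = np.random.randint(0, 256, (200, 256, 3), dtype=np.uint8)  # stand-in for a raw Genesis frame
    frames.append(raw[::2, ::2, :])                                 # downsample to (100, 128, 3)

# Channel i of the output comes from channel i of frame i.
stacked = np.stack([frame[:, :, i] for i, frame in enumerate(frames)], axis=-1)
print(stacked.shape)  # (100, 128, 3)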

View File

@@ -0,0 +1,76 @@
import time
import retro
from stable_baselines3 import PPO
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

game = "StreetFighterIISpecialChampionEdition-Genesis"
state_stages = [
    "Champion.Level1.RyuVsGuile",
    "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3 | -20.4
    "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
    "Champion.Level2.ChunLiVsKen",
    "Champion.Level3.ChunLiVsChunLi",
    "Champion.Level4.ChunLiVsZangief",
    "Champion.Level5.ChunLiVsDhalsim",
    "Champion.Level6.ChunLiVsRyu",
    "Champion.Level7.ChunLiVsEHonda",
    "Champion.Level8.ChunLiVsBlanka",
    "Champion.Level9.ChunLiVsBalrog",
    "Champion.Level10.ChunLiVsVega",
    "Champion.Level11.ChunLiVsSagat",
    "Champion.Level12.ChunLiVsBison"
    # Add other stages as necessary
]

env = make_env(game, state_stages[0])()

model = PPO(
    "CnnPolicy",
    env,
    verbose=1
)
model_path = r"trained_models_ryu_level_1_reward_x3/ppo_ryu_6600000_steps"
model.load(model_path)

# Average reward for optuna/trial_1_best_model: -82.3
# Average reward for optuna/trial_9_best_model: 36.7 | -86.23
# Average reward for trained_models/ppo_chunli_5376000_steps: -77.8

obs = env.reset()
done = False

num_episodes = 30
episode_reward_sum = 0
for _ in range(num_episodes):
    done = False
    obs = env.reset()
    total_reward = 0
    while True:
        timestamp = time.time()
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        if reward != 0:
            total_reward += reward
            print("Reward: {}, playerHP: {}, enemyHP:{}".format(reward, info['health'], info['enemy_health']))
        env.render()
        # time.sleep(0.005)
    # print("Total reward: {}".format(total_reward))
    # episode_reward_sum += total_reward

# env.close()
# print("Average reward for {}: {}".format(model_path, episode_reward_sum/num_episodes))

View File

@@ -0,0 +1,151 @@
import os
import random
import retro
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs'
os.makedirs(LOG_DIR, exist_ok=True)

class RandomOpponentChangeCallback(BaseCallback):
    def __init__(self, stages, opponent_interval, verbose=0):
        super(RandomOpponentChangeCallback, self).__init__(verbose)
        self.stages = stages
        self.opponent_interval = opponent_interval

    def _on_step(self) -> bool:
        if self.n_calls % self.opponent_interval == 0:
            new_state = random.choice(self.stages)
            print("\nCurrent state:", new_state)
            self.training_env.env_method("load_state", new_state, indices=None)
        return True

# class StageIncreaseCallback(BaseCallback):
#     def __init__(self, stages, stage_interval, save_dir, verbose=0):
#         super(StageIncreaseCallback, self).__init__(verbose)
#         self.stages = stages
#         self.stage_interval = stage_interval
#         self.save_dir = save_dir
#         self.current_stage = 0
#
#     def _on_step(self) -> bool:
#         if self.n_calls % self.stage_interval == 0 and self.current_stage < len(self.stages) - 1:
#             self.current_stage += 1
#             new_state = self.stages[self.current_stage]
#             self.training_env.env_method("load_state", new_state, indices=None)
#             self.model.save(os.path.join(self.save_dir, f"ppo_chunli_stage_{self.current_stage}.zip"))
#         return True

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

def main():
    # Set up the environment and model
    game = "StreetFighterIISpecialChampionEdition-Genesis"
    state_stages = [
        "Champion.Level1.RyuVsGuile",
        "Champion.Level1.ChunLiVsGuile", # Average reward for random strategy: -102.3
        "Champion.Level2.ChunLiVsKen",
        "Champion.Level3.ChunLiVsChunLi",
        "Champion.Level4.ChunLiVsZangief",
        "Champion.Level5.ChunLiVsDhalsim",
        "Champion.Level6.ChunLiVsRyu",
        "Champion.Level7.ChunLiVsEHonda",
        "Champion.Level8.ChunLiVsBlanka",
        "Champion.Level9.ChunLiVsBalrog",
        "Champion.Level10.ChunLiVsVega",
        "Champion.Level11.ChunLiVsSagat",
        "Champion.Level12.ChunLiVsBison"
        # Add other stages as necessary
    ]
    # state_stages = [
    #     "ChampionX.Level1.ChunLiVsKen", # Average reward for random strategy: -247.6
    #     "ChampionX.Level2.ChunLiVsChunLi",
    #     "ChampionX.Level3.ChunLiVsZangief",
    #     "ChampionX.Level4.ChunLiVsDhalsim",
    #     "ChampionX.Level5.ChunLiVsRyu",
    #     "ChampionX.Level6.ChunLiVsEHonda",
    #     "ChampionX.Level7.ChunLiVsBlanka",
    #     "ChampionX.Level8.ChunLiVsGuile",
    #     "ChampionX.Level9.ChunLiVsBalrog",
    #     "ChampionX.Level10.ChunLiVsVega",
    #     "ChampionX.Level11.ChunLiVsSagat",
    #     "ChampionX.Level12.ChunLiVsBison"
    #     # Add other stages as necessary
    # ]
    # Champion is at difficulty level 4, ChampionX is at difficulty level 8.

    env = make_env(game, state_stages[0])()
    # Wrap env in Monitor wrapper to record training progress
    env = Monitor(env, LOG_DIR)

    model = PPO(
        "CnnPolicy",
        env,
        device="cuda",
        verbose=1,
        n_steps=2048,
        batch_size=64,
        learning_rate=1e-4,
        gamma=0.99,
        tensorboard_log="logs"
    )

    # Set the save directory
    save_dir = "trained_models_ryu_level_1_time_reward"
    os.makedirs(save_dir, exist_ok=True)

    # Load the model from file
    # model_path = "trained_models/ppo_chunli_1296000_steps.zip"
    # Load model and modify the learning rate and entropy coefficient
    # custom_objects = {
    #     "learning_rate": 0.0002
    # }
    # model = PPO.load(model_path, env=env, device="cuda")#, custom_objects=custom_objects)

    # Set up callbacks
    # opponent_interval = 35840 # stage_interval * num_envs = total_steps_per_stage
    checkpoint_interval = 200000 # checkpoint_interval * num_envs = total_steps_per_checkpoint (Every 80 rounds)
    checkpoint_callback = CheckpointCallback(save_freq=checkpoint_interval, save_path=save_dir, name_prefix="ppo_ryu")
    # stage_increase_callback = RandomOpponentChangeCallback(state_stages, opponent_interval, save_dir)

    # model_params = {
    #     'n_steps': 5,
    #     'gamma': 0.99,
    #     'gae_lambda': 1,
    #     'learning_rate': 7e-4,
    #     'vf_coef': 0.5,
    #     'ent_coef': 0.0,
    #     'max_grad_norm': 0.5,
    #     'rms_prop_eps': 1e-05
    # }
    # model = A2C('CnnPolicy', env, tensorboard_log='logs/', verbose=1, **model_params, policy_kwargs=dict(optimizer_class=RMSpropTF))

    model.learn(
        total_timesteps=int(10000000), # total_timesteps = stage_interval * num_envs * num_stages (1120 rounds)
        callback=[checkpoint_callback]#, stage_increase_callback]
    )
    env.close()

    # Save the final model
    model.save(os.path.join(save_dir, "ppo_sf2_ryu_final.zip"))

if __name__ == "__main__":
    main()
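A note on RandomOpponentChangeCallback: self.training_env.env_method("load_state", ...) works because Stable-Baselines3 wraps the single environment in a DummyVecEnv, and the call is forwarded through the gym wrapper to retro's RetroEnv.load_state, so the emulator jumps to the chosen save state mid-training. A standalone sketch of the same mechanism, using make_env and state names from the file above:

from stable_baselines3.common.vec_env import DummyVecEnv

game = "StreetFighterIISpecialChampionEdition-Genesis"

# Vectorize a single env, then hot-swap the opponent by loading a different save state.
venv = DummyVecEnv([make_env(game, "Champion.Level1.ChunLiVsGuile")])
venv.env_method("load_state", "Champion.Level5.ChunLiVsDhalsim", indices=None)
obs = venv.reset()  # the emulator now runs from the newly loaded state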

View File

@@ -0,0 +1,68 @@
import os
import retro
import optuna
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from street_fighter_custom_wrapper import StreetFighterCustomWrapper

LOG_DIR = 'logs/'
OPT_DIR = 'optuna/'
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OPT_DIR, exist_ok=True)

def optimize_ppo(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', 512, 2048, log=True),
        'gamma': trial.suggest_float('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_float('learning_rate', 5e-5, 5e-4, log=True),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.9999)
    }

def make_env(game, state):
    def _init():
        env = retro.make(
            game=game,
            state=state,
            use_restricted_actions=retro.Actions.FILTERED,
            obs_type=retro.Observations.IMAGE
        )
        env = StreetFighterCustomWrapper(env)
        return env
    return _init

def optimize_agent(trial):
    game = "StreetFighterIISpecialChampionEdition-Genesis"
    state = "Champion.Level1.ChunLiVsGuile" # "ChampionX.Level1.ChunLiVsKen"
    try:
        model_params = optimize_ppo(trial)

        # Create environment
        env = make_env(game, state)()
        env = Monitor(env, LOG_DIR)

        # Create algo
        model = PPO('CnnPolicy', env, verbose=1, **model_params)
        model.learn(total_timesteps=500000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=30, deterministic=False)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward
    except Exception:
        return -1

# Creating the experiment
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

print(study.best_params)
print(study.best_trial)