First version, not tested yet.

2023-05-27 18:41:58 +08:00 · 2023-05-27 18:41:58 +08:00 · 81e272763e
commit 81e272763e
parent 005ddfed49
28 changed files with 660 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,72 @@
-# transformer-trading
+# AutoTradingSystem
 Trial project for deep learning trading model with ChatGPT-4
 This project is an automatic trading system based on a Transformer and Reinforcement Learning hybrid model.
 Trial project for deep learning trading model with ChatGPT-4
 ## Setup
 1. Install the required packages: `pip install -r requirements.txt`
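 2. Run the main script from the project root: `python -m src.main` (the code imports modules as `src.…`, so it must be run as a module rather than as `python src/main.py`)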
 ## Structure
 trading-system/
 │
 ├── data/
 │   ├── raw/           # Raw data files
 │   └── processed/     # Processed data files
 │
 ├── models/            # Trained models and model checkpoints
 │
 ├── logs/              # Training logs, evaluation results, etc.
 │
 ├── notebooks/         # Jupyter notebooks
 │   ├── data_exploration.ipynb
 │   ├── model_training.ipynb
 │   ├── model_evaluation.ipynb
 │   └── demo.ipynb
 │
 ├── src/
 │   ├── data/          # Data-related modules
 │   │   ├── __init__.py
 │   │   ├── data_collection.py
 │   │   └── data_preprocessing.py
 │   │
 │   ├── models/        # Model-related modules
 │   │   ├── __init__.py
 │   │   ├── transformer_model.py
 │   │   ├── rl_model.py
 │   │   └── trading_agent.py
 │   │
 │   ├── training/      # Training-related modules
 │   │   ├── __init__.py
 │   │   └── train.py
 │   │
 │   ├── evaluation/    # Evaluation-related modules
 │   │   ├── __init__.py
 │   │   └── evaluate.py
 │   │
 │   ├── utils/         # Utility modules
 │   │   ├── __init__.py
 │   │   ├── metrics.py
 │   │   └── utils.py
 │   │
 │   └── main.py        # Main entry point for the project
 │
 ├── tests/             # Test-related modules
 │   ├── __init__.py
 │   ├── test_data_collection.py
 │   ├── test_data_preprocessing.py
 │   ├── test_transformer_model.py
 │   ├── test_rl_model.py
 │   ├── test_trading_model.py
 │   └── test_metrics.py
 │
 ├── requirements.txt   # Required Python packages
 │
 └── README.md          # Project documentation
 Trial project for deep learning trading model with ChatGPT-4
--- a/notebooks/data_exploration.py
+++ b/notebooks/data_exploration.py
@ -0,0 +1,51 @@
 # %% Import required packages
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 # %% Load data
 # Assume that we have a CSV file in the processed data folder
 data = pd.read_csv('./data/processed/processed_data.csv')
 # %% Display the first few rows of the data
 print(data.head())
 # %% Display data summary
 print(data.describe())
 # %% Check for missing values
 print(data.isnull().sum())
 # %% Visualize the closing prices
 plt.figure(figsize=(14, 7))
 plt.plot(data['Close'])
 plt.title('Closing Prices Over Time')
 plt.xlabel('Time')
 plt.ylabel('Price')
 plt.show()
 # %% Display the distribution of daily returns
 # pct_change() leaves a NaN in the first row, hence the dropna()
 daily_returns = data['Close'].pct_change().dropna()
 sns.histplot(daily_returns, bins=50, kde=True)
 plt.title('Distribution of Daily Returns')
 plt.show()
 # %% Display correlation between different features
 # Correlate numeric columns only: if the CSV carries a non-numeric column
 # (for example a date or ticker column), DataFrame.corr() raises on it in
 # pandas >= 2.0 instead of silently dropping it.
 correlation_matrix = data.select_dtypes(include='number').corr()
 sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
 plt.title('Correlation Matrix of Features')
 plt.show()
 # %% Display a scatter plot of volume vs closing price
 plt.scatter(data['Volume'], data['Close'])
 plt.title('Volume vs Closing Price')
 plt.xlabel('Volume')
 plt.ylabel('Closing Price')
 plt.show()
 # %% Display time series decomposition if applicable
 # You might need to install and import statsmodels for this
 # from statsmodels.tsa.seasonal import seasonal_decompose
 # decomposed = seasonal_decompose(data['Close'], model='multiplicative', period=252)  # Assume that period is 252 for trading days in a year
 # decomposed.plot()
 # plt.show()
--- a/notebooks/demo.py
+++ b/notebooks/demo.py
--- a/notebooks/model_evaluation.py
+++ b/notebooks/model_evaluation.py
@ -0,0 +1,35 @@
 # %% Import required packages
 import torch
 from src.models.transformer_model import TransformerModel
 from src.models.rl_model import RLModel
 from src.models.trading_agent import TradingAgent
 # NOTE(review): evaluate_trading_agent and load_processed_data are not among
 # the definitions visible in src/evaluation/evaluate.py and
 # src/data/data_preprocessing.py in this commit - confirm they exist.
 from src.evaluation.evaluate import evaluate_trading_agent
 from src.data.data_preprocessing import load_processed_data
 # %% Set device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # %% Load processed data
 data = load_processed_data('./data/processed/processed_data.csv')
 # %% Initialize models
 # NOTE(review): as defined in this commit, TransformerModel and RLModel are
 # plain classes without .to()/.load_state_dict(), and RLModel requires an
 # `env` argument - confirm the intended model API before running this cell.
 transformer_model = TransformerModel().to(device)
 rl_model = RLModel().to(device)
 trading_agent = TradingAgent(transformer_model, rl_model)
 # %% Load model weights
 # map_location remaps the stored tensors onto the device selected above, so a
 # checkpoint written on a GPU machine can still be restored on a CPU-only one.
 transformer_model.load_state_dict(torch.load('./models/transformer_model.pth', map_location=device))
 rl_model.load_state_dict(torch.load('./models/rl_model.pth', map_location=device))
 # %% Evaluate the trading agent
 trading_agent_results = evaluate_trading_agent(trading_agent, data)
 # %% Display evaluation results
 print("Total Profit: ", trading_agent_results['total_profit'])
 print("Total Trades Made: ", trading_agent_results['total_trades'])
 print("Successful Trades: ", trading_agent_results['successful_trades'])
 # %% Save evaluation results
 # One "key: value" line per entry of the results mapping
 with open('./logs/evaluation_results.txt', 'w') as f:
    for key, value in trading_agent_results.items():
        f.write(f'{key}: {value}\n')
--- a/notebooks/model_training.py
+++ b/notebooks/model_training.py
@ -0,0 +1,46 @@
 # %% Import required packages
 import torch
 from src.models.transformer_model import TransformerModel
 from src.models.rl_model import RLModel
 from src.models.trading_agent import TradingAgent
 # NOTE(review): src/training/train.py in this commit defines
 # train_transformer_model(transformer_model, train_data, epochs, learning_rate)
 # and train_rl_model(rl_model, env, timesteps); the names imported here and the
 # (model, data, hyperparams) call shape used below do not match - confirm.
 from src.training.train import train_transformer, train_rl
 # NOTE(review): load_processed_data is not among the definitions visible in
 # src/data/data_preprocessing.py in this commit - confirm it exists.
 from src.data.data_preprocessing import load_processed_data
 # %% Set device
 # Use the GPU when one is available, otherwise fall back to the CPU
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # %% Load processed data
 data = load_processed_data('./data/processed/processed_data.csv')
 # %% Initialize models
 # NOTE(review): as defined in this commit, TransformerModel and RLModel are
 # plain classes without a .to() method, and RLModel requires an `env`
 # argument - confirm the intended model API before running this cell.
 transformer_model = TransformerModel().to(device)
 rl_model = RLModel().to(device)
 trading_agent = TradingAgent(transformer_model, rl_model)
 # %% Train Transformer Model
 # Set the appropriate hyperparameters
 transformer_model_hyperparams = {
    "epochs": 10,
    "batch_size": 32,
    "learning_rate": 0.001,
 }
 train_transformer(transformer_model, data, transformer_model_hyperparams)
 # %% Save Transformer Model
 # Only the state dict (the weights) is written, not the model object itself
 torch.save(transformer_model.state_dict(), './models/transformer_model.pth')
 # %% Train RL Model
 # Set the appropriate hyperparameters
 rl_model_hyperparams = {
    "epochs": 500,
    "batch_size": 32,
    "learning_rate": 0.001,
    "gamma": 0.99,  # discount factor
    "epsilon_start": 1.0,  # exploration rate at the beginning
    "epsilon_end": 0.01,  # minimum exploration rate
    "epsilon_decay": 0.995,  # exponential decay rate for exploration probability
 }
 train_rl(trading_agent, data, rl_model_hyperparams)
 # %% Save RL Model
 torch.save(rl_model.state_dict(), './models/rl_model.pth')
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,8 @@
 numpy
 pandas
 torch
 transformers
 gym
 stable-baselines3
 yfinance
 matplotlib
 seaborn
 scikit-learn
 pytest
--- a/src/data/init.py
+++ b/src/data/init.py
--- a/src/data/data_collection.py
+++ b/src/data/data_collection.py
@ -0,0 +1,19 @@
 import yfinance as yf
 def collect_data(tickers, start_date, end_date):
    """
    Collects data for the given tickers and date range.
    Parameters:
    tickers (str or iterable of str): One ticker symbol, or several of them.
    start_date (str): Start date in format 'YYYY-MM-DD'.
    end_date (str): End date in format 'YYYY-MM-DD'.
    Returns:
    dict: Dictionary where the keys are ticker symbols and the values are whatever
        `yf.download` returns for that ticker (one call per ticker). An empty
        `tickers` collection yields an empty dict.
    """
    # A bare string is itself iterable: without this guard 'AAPL' would be
    # requested as the four separate "tickers" 'A', 'A', 'P' and 'L'.
    if isinstance(tickers, str):
        tickers = [tickers]
    return {
        ticker: yf.download(ticker, start=start_date, end=end_date)
        for ticker in tickers
    }
--- a/src/data/data_preprocessing.py
+++ b/src/data/data_preprocessing.py
@ -0,0 +1,16 @@
 from sklearn.preprocessing import MinMaxScaler
 def preprocess_data(data):
    """
    Preprocesses the collected data.
    Parameters:
    data (dict): The data collected from collect_data function. Keys are tickers and values are pandas DataFrames.
        The input frames are not modified.
    Returns:
    dict: A new dict with the same keys, whose values are copies of the input
        DataFrames with the 'Close' column min-max scaled to be between 0 and 1.
        Each ticker is scaled independently of the others.
    """
    processed = {}
    for ticker, frame in data.items():
        # Work on a copy so the caller's raw frames keep their original prices.
        scaled_frame = frame.copy()
        # fit_transform refits on every call, so a fresh scaler per ticker is
        # equivalent to reusing one and avoids carrying state between tickers.
        scaler = MinMaxScaler()
        scaled_frame['Close'] = scaler.fit_transform(scaled_frame[['Close']])
        processed[ticker] = scaled_frame
    return processed
--- a/src/evaluation/init.py
+++ b/src/evaluation/init.py
--- a/src/evaluation/evaluate.py
+++ b/src/evaluation/evaluate.py
@ -0,0 +1,68 @@
 import torch
 from torch.utils.data import DataLoader
 from src.models.transformer_model import TransformerModel
 from src.models.rl_model import RLModel
 from src.data.data_preprocessing import Dataset
 def evaluate_transformer_model(transformer_model, test_data):
    """
    Evaluate the Transformer model with a cross-entropy loss.
    Parameters:
    transformer_model: The model to evaluate. It must support `.to(device)`,
        `.eval()` and being called on a batch of inputs (torch.nn.Module-like).
    test_data (Dataset): The test data, yielding (inputs, targets) pairs.
    Returns:
    float: The mean of the per-batch losses (also printed).
    Raises:
    ValueError: If `test_data` produces no batches.
    """
    # Create data loader; order is irrelevant for evaluation, so no shuffling
    dataloader = DataLoader(test_data, batch_size=32, shuffle=False)
    batch_count = len(dataloader)
    if batch_count == 0:
        # Fail with a clear message instead of a ZeroDivisionError at the end.
        raise ValueError('test_data is empty; cannot compute an average loss')
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transformer_model.to(device)
    # Define loss function
    criterion = torch.nn.CrossEntropyLoss()
    # Evaluation mode
    transformer_model.eval()
    total_loss = 0.0
    # No gradients are needed while scoring the model
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = transformer_model(inputs)
            total_loss += criterion(outputs, targets).item()
    # Average over batches (a smaller final batch weighs as much as a full one)
    average_loss = total_loss / batch_count
    print(f'Average loss: {average_loss}')
    return average_loss
 def evaluate_rl_model(rl_model, env, episodes):
    """
    Evaluate the RL model by running it for a number of episodes.
    Parameters:
    rl_model (RLModel): The RL model to evaluate; only `predict(state)` is called.
    env (gym.Env): The Gym environment to use for evaluation. `reset()` must return
        the initial state and `step(action)` a 4-tuple (state, reward, done, info).
    episodes (int): The number of episodes to evaluate for.
    Returns:
    float: The average total reward per episode (also printed).
    Raises:
    ValueError: If `episodes` is not positive.
    """
    if episodes <= 0:
        raise ValueError('episodes must be a positive integer')
    total_rewards = 0
    for i_episode in range(episodes):
        state = env.reset()
        done = False
        # Accumulate per episode so the report shows the episode's total
        # reward rather than only the reward of its final step.
        episode_reward = 0
        while not done:
            action = rl_model.predict(state)
            state, reward, done, _ = env.step(action)
            episode_reward += reward
        total_rewards += episode_reward
        print(f'Episode: {i_episode+1}, Reward: {episode_reward}')
    # Compute average reward
    average_reward = total_rewards / episodes
    print(f'Average reward: {average_reward}')
    return average_reward
--- a/src/main.py
+++ b/src/main.py
@ -0,0 +1,40 @@
 import argparse
 from src.data import data_collection, data_preprocessing
 from src.models import transformer_model, rl_model, trading_model
 from src.training import train
 from src.evaluation import evaluate
 from src.utils import utils, metrics
 def main(args):
    """
    Run the pipeline end to end: seed, collect, preprocess, build, train, evaluate.
    Parameters:
    args: Parsed command-line arguments; reads `seed`, `data_source`,
        `transformer_config`, `rl_config` and `training_config`.
    NOTE(review): several names used here do not match the modules shown in this
    commit (collect_data takes tickers plus a date range, the agent lives in
    trading_agent rather than trading_model, and no train.train or
    evaluate.evaluate is visible) - confirm before relying on this entry point.
    """
    # Reproducibility first, before anything draws random numbers
    utils.seed_everything(args.seed)
    # Data: collect the raw data, then preprocess it
    collected = data_collection.collect_data(args.data_source)
    prepared = data_preprocessing.preprocess_data(collected)
    # Models: the two components, then the agent that combines them
    encoder = transformer_model.TransformerModel(args.transformer_config)
    policy = rl_model.RLModel(args.rl_config)
    agent = trading_model.TradingAgent(encoder, policy)
    # Train, then evaluate on the same prepared data and print the outcome
    train.train(agent, prepared, args.training_config)
    results = evaluate.evaluate(agent, prepared, metrics)
    print(results)
 if __name__ == "__main__":
    # Command-line interface: every option has a default, so the script can be
    # launched without any arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducibility.",
    )
    parser.add_argument(
        "--data_source",
        type=str,
        default="data/raw/",
        help="Data source for the trading data.",
    )
    parser.add_argument(
        "--transformer_config",
        type=str,
        default="config/transformer.json",
        help="Path to the Transformer model configuration file.",
    )
    parser.add_argument(
        "--rl_config",
        type=str,
        default="config/rl.json",
        help="Path to the RL model configuration file.",
    )
    parser.add_argument(
        "--training_config",
        type=str,
        default="config/training.json",
        help="Path to the training configuration file.",
    )
    args = parser.parse_args()
    main(args)
--- a/src/models/init.py
+++ b/src/models/init.py
--- a/src/models/rl_model.py
+++ b/src/models/rl_model.py
@ -0,0 +1,35 @@
 from stable_baselines3 import PPO
 from stable_baselines3.common.vec_env import DummyVecEnv
 class RLModel:
    """Wrapper that owns a PPO model together with its vectorized environment."""
    def __init__(self, env):
        """
        Wrap `env` in a DummyVecEnv and build a PPO model on top of it.
        Parameters:
        env (gym.Env): The Gym environment to use for training.
        """
        # PPO is given a vectorized environment, so wrap the single env first
        vectorized_env = DummyVecEnv([lambda: env])
        self.env = vectorized_env
        self.model = PPO('MlpPolicy', vectorized_env, verbose=1)
    def train(self, timesteps):
        """
        Run the wrapped model's learning loop.
        Parameters:
        timesteps (int): Passed to the model as `total_timesteps`.
        """
        self.model.learn(total_timesteps=timesteps)
    def predict(self, obs):
        """
        Ask the wrapped model for an action.
        Parameters:
        obs (np.array): The observations to base the prediction on.
        Returns:
        The first element of the pair returned by the wrapped model's
        `predict`, i.e. the action; the second element is discarded.
        """
        chosen_action, _ = self.model.predict(obs)
        return chosen_action
--- a/src/models/trading_agent.py
+++ b/src/models/trading_agent.py
@ -0,0 +1,33 @@
 class TradingAgent:
    """Combines a text-embedding model with an RL policy to choose trading actions."""
    def __init__(self, transformer_model, rl_model):
        """
        Initializes the TradingAgent with the Transformer and RL models.
        Parameters:
        transformer_model (TransformerModel): The Transformer model to use for predictions;
            only its `get_embeddings(text)` method is called.
        rl_model (RLModel): The RL model to use for predictions; only its
            `predict(...)` method is called.
        """
        self.transformer_model = transformer_model
        self.rl_model = rl_model
    def make_decision(self, text, obs):
        """
        Makes a trading decision based on the given text and observations.
        Parameters:
        text (str): The text to feed to the Transformer model.
        obs (np.array): The observations to feed to the RL model.
        Returns:
        Whatever `rl_model.predict` returns for the combined input
        (presumably an action such as 0: hold, 1: buy, 2: sell - confirm
        against the environment's action space).
        """
        # Imported locally: no numpy import is visible at module level here.
        import numpy as np
        # Get embeddings from transformer model
        embeddings = self.transformer_model.get_embeddings(text)
        # .cpu() first, because tensors living on a GPU cannot be converted
        # to numpy directly.
        embedding_array = embeddings.detach().cpu().numpy()
        # np.concatenate needs both arrays to have the same number of
        # dimensions. When the embeddings are multi-dimensional (e.g. one
        # vector per token) and obs is a flat vector, average over every
        # leading axis to obtain a single fixed-size feature vector.
        # NOTE(review): mean pooling is an assumption - confirm the intended
        # pooling strategy.
        if embedding_array.ndim > 1 and np.ndim(obs) == 1:
            feature_size = embedding_array.shape[-1]
            embedding_array = embedding_array.reshape(-1, feature_size).mean(axis=0)
        # Combine embeddings with observations
        combined_input = np.concatenate((embedding_array, obs))
        # Get action from RL model
        action = self.rl_model.predict(combined_input)
        return action
--- a/src/models/transformer_model.py
+++ b/src/models/transformer_model.py
@ -0,0 +1,20 @@
 from transformers import BertModel, BertTokenizer
 class TransformerModel:
    """Pairs a pretrained BERT tokenizer with its encoder to embed text."""
    def __init__(self, pretrained_model_name='bert-base-uncased'):
        """
        Load the tokenizer and the model for the given checkpoint.
        Parameters:
        pretrained_model_name (str): Name passed to both `from_pretrained` calls.
        """
        checkpoint = pretrained_model_name
        self.tokenizer = BertTokenizer.from_pretrained(checkpoint)
        self.model = BertModel.from_pretrained(checkpoint)
    def get_embeddings(self, text):
        """
        Tokenize `text`, run it through the model and return the hidden states.
        Parameters:
        text (str): Text to get embeddings for.
        Returns:
        torch.Tensor: The `last_hidden_state` of the model output for the input text.
        """
        # Tokenizer settings: PyTorch tensors, with truncation and padding on
        tokenizer_options = {'return_tensors': 'pt', 'truncation': True, 'padding': True}
        encoded = self.tokenizer(text, **tokenizer_options)
        return self.model(**encoded).last_hidden_state
--- a/src/training/init.py
+++ b/src/training/init.py
--- a/src/training/train.py
+++ b/src/training/train.py
@ -0,0 +1,57 @@
 import torch
 from torch.utils.data import DataLoader
 from src.models.transformer_model import TransformerModel
 from src.models.rl_model import RLModel
 from src.data.data_preprocessing import Dataset
 def train_transformer_model(transformer_model, train_data, epochs, learning_rate, batch_size=32):
    """
    Train the Transformer model with Adam and a cross-entropy loss.
    Parameters:
    transformer_model: The model to train. It must support `.to(device)`,
        `.train()`, `.parameters()` and being called on a batch of inputs
        (torch.nn.Module-like).
    train_data (Dataset): The training data, yielding (inputs, targets) pairs.
    epochs (int): The number of epochs to train for.
    learning_rate (float): The learning rate for the optimizer.
    batch_size (int): Number of samples per batch. Defaults to 32, the value
        that used to be hard-coded.
    """
    # Create data loader; batches are reshuffled every epoch
    dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transformer_model.to(device)
    # Make sure training-only layers (dropout, batch norm) are active even if
    # the model was previously switched to eval mode.
    transformer_model.train()
    # Define loss function and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(transformer_model.parameters(), lr=learning_rate)
    # Training loop
    for epoch in range(epochs):
        for i, (inputs, targets) in enumerate(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)
            # Forward pass
            outputs = transformer_model(inputs)
            # Compute loss
            loss = criterion(outputs, targets)
            # Backward pass and optimization; gradients are cleared first
            # because PyTorch accumulates them across backward() calls
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Print loss every 100 batches
            if i % 100 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item()}')
 def train_rl_model(rl_model, env, timesteps):
    """
    Train the RL model by delegating to its own `train` method.
    Parameters:
    rl_model (RLModel): The RL model to train; only `rl_model.train(timesteps)` is called.
    env (gym.Env): The Gym environment to use for training. Accepted but never
        read by this function.
    timesteps (int): The number of timesteps to train for, forwarded unchanged.
    """
    rl_model.train(timesteps)
--- a/src/utils/init.py
+++ b/src/utils/init.py
--- a/src/utils/metrics.py
+++ b/src/utils/metrics.py
@ -0,0 +1,36 @@
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 def compute_classification_metrics(y_true, y_pred):
    """
    Score predicted labels against the ground truth.
    Parameters:
    y_true (np.array): Ground truth labels.
    y_pred (np.array): Predicted labels.
    Returns:
    dict: The keys 'accuracy', 'precision', 'recall' and 'f1'. Precision,
        recall and F1 use weighted averaging with zero_division=0.
    """
    # Shared settings for the three averaged scores
    averaging = {'average': 'weighted', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, **averaging),
        'recall': recall_score(y_true, y_pred, **averaging),
        'f1': f1_score(y_true, y_pred, **averaging),
    }
 def compute_reward_metrics(total_rewards, num_episodes):
    """
    Summarize per-episode rewards.
    Parameters:
    total_rewards (list): List of total rewards per episode.
    num_episodes (int): Total number of episodes; used as the divisor for the average.
    Returns:
    dict: The keys 'average_reward' (sum of rewards divided by num_episodes),
        'max_reward' and 'min_reward'.
    """
    summary = {
        'average_reward': sum(total_rewards) / num_episodes,
        'max_reward': max(total_rewards),
        'min_reward': min(total_rewards),
    }
    return summary
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@ -0,0 +1,52 @@
 import numpy as np
 import torch
 from sklearn.preprocessing import MinMaxScaler
 def seed_everything(seed):
    """
    Set a seed for the random number generators to ensure reproducibility.
    Seeds Python's `random` module, NumPy's global generator and PyTorch.
    Parameters:
    seed (int): The seed to use.
    """
    # Imported locally so the function does not depend on a file-level import.
    import random
    # Python's own generator was previously left unseeded, so anything using
    # `random` stayed non-deterministic between runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
 def scale_data(data):
    """
    Fit a MinMaxScaler on the data and transform it in one step.
    Parameters:
    data (np.array): The data to scale.
    Returns:
    tuple: (scaled_data, scaler) - the transformed data together with the
        fitted scaler.
    """
    fitted_scaler = MinMaxScaler()
    return fitted_scaler.fit_transform(data), fitted_scaler
 def save_model(model, path):
    """
    Save a PyTorch model's state dict to disk.
    Parameters:
    model (torch.nn.Module): The model to save; only `model.state_dict()` is written.
    path (str): The path where to save the model.
    """
    # Only the state dict is stored, not the model object itself
    torch.save(model.state_dict(), path)
 def load_model(model, path, map_location='cpu'):
    """
    Load saved weights into a PyTorch model and switch it to eval mode.
    Parameters:
    model (torch.nn.Module): The model to load the weights into.
    path (str): The path from where to load the model.
    map_location: Passed to `torch.load`. The default 'cpu' lets a checkpoint
        saved on a GPU be restored on a CPU-only machine; `load_state_dict`
        then copies the values into the model's existing parameters.
    Returns:
    torch.nn.Module: The same `model` object, with the loaded weights, in eval mode.
    """
    # NOTE: torch.load unpickles the file - only load checkpoints from a
    # trusted source.
    state_dict = torch.load(path, map_location=map_location)
    model.load_state_dict(state_dict)
    model.eval()
    return model
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/test_data_collection.py
+++ b/tests/test_data_collection.py
@ -0,0 +1,13 @@
 import pytest
 from src.data import data_collection
 def test_collect_data(monkeypatch):
    """collect_data returns one frame per requested ticker, keyed by symbol."""
    import pandas as pd
    requested = []
    def fake_download(ticker, start=None, end=None):
        # Stand-in for the yfinance download so the test needs no network.
        requested.append((ticker, start, end))
        return pd.DataFrame({
            'Open': [1.0],
            'High': [1.1],
            'Low': [0.9],
            'Close': [1.0],
            'Volume': [1000],
        })
    monkeypatch.setattr(data_collection.yf, 'download', fake_download)
    # Test the collect_data function
    data = data_collection.collect_data(['AAA', 'BBB'], '2020-01-01', '2020-01-31')
    # Check that every requested ticker is present, and nothing else
    assert set(data) == {'AAA', 'BBB'}
    # Check that the date range was forwarded unchanged for each ticker
    assert requested == [
        ('AAA', '2020-01-01', '2020-01-31'),
        ('BBB', '2020-01-01', '2020-01-31'),
    ]
    # Check that the data has the expected columns
    expected_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    for frame in data.values():
        assert all(column in frame.columns for column in expected_columns)
--- a/tests/test_data_preprocessing.py
+++ b/tests/test_data_preprocessing.py
@ -0,0 +1,26 @@
 import pytest
 import pandas as pd
 from src.data import data_preprocessing
 def test_preprocess_data():
    """preprocess_data scales 'Close' into [0, 1] and leaves the other columns alone."""
    # create a mock data
    raw_data = pd.DataFrame({
        'Open': [1.0, 2.0, 3.0, 4.0, 5.0],
        'High': [1.1, 2.1, 3.1, 4.1, 5.1],
        'Low': [0.9, 1.9, 2.9, 3.9, 4.9],
        'Close': [1.0, 2.0, 3.0, 4.0, 5.0],
        'Volume': [1000, 2000, 3000, 4000, 5000]
    })
    # Record expectations up front so the checks do not depend on whether
    # preprocessing modifies its input.
    expected_shape = raw_data.shape
    expected_open = raw_data['Open'].tolist()
    expected_volume = raw_data['Volume'].tolist()
    # perform preprocessing; the function takes a dict of ticker -> DataFrame
    processed = data_preprocessing.preprocess_data({'TEST': raw_data})
    assert set(processed) == {'TEST'}
    processed_data = processed['TEST']
    # check that the data has the expected columns
    expected_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    assert all(column in processed_data.columns for column in expected_columns)
    # check the shape of the data
    assert processed_data.shape == expected_shape
    # check that the closing prices are min-max scaled into [0, 1]
    assert processed_data['Close'].tolist() == pytest.approx([0.0, 0.25, 0.5, 0.75, 1.0])
    # check that the columns that are not scaled keep their values
    assert processed_data['Open'].tolist() == expected_open
    assert processed_data['Volume'].tolist() == expected_volume
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
--- a/tests/test_rl_model.py
+++ b/tests/test_rl_model.py
--- a/tests/test_trading_model.py
+++ b/tests/test_trading_model.py
--- a/tests/test_transformer_model.py
+++ b/tests/test_transformer_model.py
@ -0,0 +1,34 @@
 import pytest
 import torch
 from src.models import transformer_model
 def test_transformer_model():
    """get_embeddings returns one hidden-size vector per token for a single text."""
    # Instantiate the model (loads the default pretrained checkpoint)
    model = transformer_model.TransformerModel()
    # The wrapper exposes get_embeddings(text); it is not callable on
    # (input_ids, attention_mask) and has no hidden_size/device attributes,
    # so the test goes through its public method and the wrapped BERT model.
    embeddings = model.get_embeddings('stocks rallied after earnings')
    # Check output dimensions: (batch, tokens, hidden size)
    assert embeddings.dim() == 3
    assert embeddings.size(0) == 1
    assert embeddings.size(-1) == model.model.config.hidden_size
    # Check that the output is on the same device as the wrapped model
    assert embeddings.device == model.model.device
 def test_model_save_load(tmp_path):
    """Weights saved from the wrapped model are restored unchanged."""
    # Instantiate the model
    model = transformer_model.TransformerModel()
    # Save the wrapped model and tokenizer into pytest's temporary directory,
    # so the test leaves nothing behind in the working directory. The wrapper
    # has no save_pretrained of its own; its `model` and `tokenizer` do.
    save_dir = str(tmp_path / 'test_model')
    model.model.save_pretrained(save_dir)
    model.tokenizer.save_pretrained(save_dir)
    # Load the model back through the wrapper's constructor
    loaded_model = transformer_model.TransformerModel(save_dir)
    # Check that the loaded model has the same parameters as the original model
    original_params = list(model.model.parameters())
    restored_params = list(loaded_model.model.parameters())
    assert len(original_params) == len(restored_params)
    for p1, p2 in zip(original_params, restored_params):
        assert torch.all(p1.eq(p2))