auto-trading/notebooks/data_preprocessing.py

39 lines
1.6 KiB
Python

# %% Import required packages
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# %% Load data
# Assume that we have a CSV file in the raw data folder.
data = pd.read_csv('../data/raw/raw_data.csv')

# %% Display the first few rows of the data
# It's always a good idea to take a look at the data before starting preprocessing.
print(data.head())

# %% Handle missing values
# This will depend on your specific dataset. Here we simply remove rows with
# any missing values.
data = data.dropna()

# %% Convert date column to datetime and set it as the index
# A DatetimeIndex is the standard representation for time-series analysis.
# NOTE(review): this assumes the CSV has a 'Date' column and that its rows are
# already in chronological order — confirm against the raw data.
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date')

# %% Split data into training and testing sets
# shuffle=False keeps chronological order: train on the past, evaluate on the
# future. Shuffling a time series before splitting would leak future rows into
# the training set.
train_data, test_data = train_test_split(data, test_size=0.2, shuffle=False)

# %% Normalize data (optional)
# BUG FIX: the original script fit the scaler on the *entire* dataset and then
# discarded the normalized result entirely. Fitting on all rows leaks
# statistics (min/max) from the test period into training. Instead, fit the
# scaler on the training split only, then apply that same fitted transform to
# the test split.
scaler = MinMaxScaler()
train_normalized = pd.DataFrame(
    scaler.fit_transform(train_data),
    columns=train_data.columns,
    index=train_data.index,
)
test_normalized = pd.DataFrame(
    scaler.transform(test_data),  # transform only — never re-fit on test data
    columns=test_data.columns,
    index=test_data.index,
)

# %% Save processed data
# Raw (unscaled) splits — byte-identical outputs to the original script.
train_data.to_csv('../data/processed/train_data.csv')
test_data.to_csv('../data/processed/test_data.csv')
# Normalized splits, for models that require scaled inputs.
train_normalized.to_csv('../data/processed/train_data_normalized.csv')
test_normalized.to_csv('../data/processed/test_data_normalized.csv')