# auto-trading/notebooks/data_exploration.py

# 77 lines
# 2.8 KiB
# Python

# %% Import required packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %% Load data
# The processed dataset is expected as a CSV file in the processed data folder.
processed_csv_path = '../data/processed/processed_data.csv'
data = pd.read_csv(processed_csv_path)
# %% Display the first few rows of the data
# A quick look at the leading rows shows the columns and how the values are laid out.
first_rows = data.head()
print(first_rows)
# %% Display data summary
# Summary statistics such as the mean and standard deviation of the data.
summary_statistics = data.describe()
print(summary_statistics)
# %% Check for missing values
# Per-column count of missing entries; gaps can hurt a model and should be handled first.
missing_per_column = data.isna().sum()
print(missing_per_column)
# %% Visualize the closing prices
# A line chart of the close makes the overall trend and any seasonality easier to spot.
closing_fig, closing_ax = plt.subplots(figsize=(14, 7))
closing_ax.plot(data['Close'])
closing_ax.set_title('Closing Prices Over Time')
closing_ax.set_xlabel('Time')
closing_ax.set_ylabel('Price')
plt.show()
# %% Display the distribution of daily returns
# The spread of day-over-day percentage changes hints at how volatile the stock is.
close_change = data['Close'].pct_change()
# The first entry has no previous close to compare against, so it is dropped.
daily_returns = close_change.dropna()
returns_ax = sns.histplot(daily_returns, bins=50, kde=True)
returns_ax.set_title('Distribution of Daily Returns')
plt.show()
# %% Display correlation between different features
# Correlation can indicate if there are any dependent relationships between the variables.
# Only numeric columns can be correlated. On pandas >= 2.0, DataFrame.corr() raises when
# the frame still contains a non-numeric column (e.g. a date or ticker string), so the
# input is restricted explicitly instead of relying on version-dependent defaults.
numeric_features = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_features.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Features')
plt.show()
# %% Display a scatter plot of volume vs closing price
# Plotting one variable against the other shows how traded volume and the close relate.
volume_values, close_values = data['Volume'], data['Close']
volume_ax = plt.gca()
volume_ax.scatter(volume_values, close_values)
volume_ax.set_title('Volume vs Closing Price')
volume_ax.set_xlabel('Volume')
volume_ax.set_ylabel('Closing Price')
plt.show()
# %% Display time series decomposition if applicable
# Time series decomposition can help in understanding the trend, seasonality, and noise in the data.
# Please note that this requires statsmodels library.
from statsmodels.tsa.seasonal import seasonal_decompose

# Assume that period is 252 for trading days in a year
decomposition_period = 252
# seasonal_decompose rejects missing values, so only the observed prices are decomposed.
observed_close = data['Close'].dropna()
# The decomposition is only applicable when at least two complete cycles are available
# and, for the multiplicative model, when every value is strictly positive. Checking
# up front lets the rest of the notebook run instead of stopping on a ValueError.
if len(observed_close) < 2 * decomposition_period:
    print(
        'Skipping decomposition: need at least '
        f'{2 * decomposition_period} observations, got {len(observed_close)}.'
    )
elif (observed_close <= 0).any():
    print('Skipping decomposition: multiplicative model requires strictly positive prices.')
else:
    decomposed = seasonal_decompose(
        observed_close, model='multiplicative', period=decomposition_period
    )
    decomposed.plot()
    plt.show()
# %% Display moving averages
# Rolling means over short, medium and long windows smooth the series to expose its trend.
rolling_windows = (
    (7, '7 Day Average'),
    (30, '30 Day Average'),
    (90, '90 Day Average'),
)
for window_size, legend_label in rolling_windows:
    data['Close'].rolling(window=window_size).mean().plot(label=legend_label)
plt.legend()
plt.title('Moving Averages of Closing Prices')
plt.show()
# %% Display Autocorrelation plot
# Autocorrelation helps tell random data apart from data that follows a pattern.
# autocorrelation_plot is provided by the pandas.plotting module.
from pandas.plotting import autocorrelation_plot
autocorr_ax = autocorrelation_plot(data['Close'])
autocorr_ax.set_title('Autocorrelation of Closing Prices')
plt.show()