# Time-series data preprocessing script (notebook-style `# %%` cells).
# %% Import required packages

import os

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# %% Load data

# Assume that we have a CSV file in the raw data folder.
data = pd.read_csv('../data/raw/raw_data.csv')


# %% Display the first few rows of the data

# It's always a good idea to take a look at the data before starting preprocessing.
print(data.head())


# %% Handle missing values

# This will depend on your specific dataset. Here we'll simply remove rows
# with missing values.
data = data.dropna()


# %% Convert date column to datetime

# If your dataset includes a date column, convert it to datetime format for
# time series analysis.
# NOTE(review): assumes the raw CSV has a column literally named 'Date' — confirm.
data['Date'] = pd.to_datetime(data['Date'])


# %% Set date as index

# For time series analysis, set the date column as the DataFrame index.
# Sort chronologically so the shuffle=False split below actually separates
# past (train) from future (test), even if the raw CSV rows are out of order.
data = data.set_index('Date').sort_index()


# %% Split data into training and testing sets

# Split BEFORE normalizing. shuffle=False preserves chronological order,
# which is what we want for time series evaluation.
train_data, test_data = train_test_split(data, test_size=0.2, shuffle=False)


# %% Normalize data (optional)

# Fit the scaler on the TRAINING set only, then apply the same transform to
# the test set. Fitting on the full dataset (as this script originally did)
# leaks test-set min/max statistics into training — a classic data-leakage
# bug. The normalized frames were also previously computed but never used;
# they are now saved alongside the raw splits.
scaler = MinMaxScaler()
train_normalized = pd.DataFrame(
    scaler.fit_transform(train_data),
    columns=train_data.columns,
    index=train_data.index,
)
test_normalized = pd.DataFrame(
    scaler.transform(test_data),
    columns=test_data.columns,
    index=test_data.index,
)


# %% Save processed data

# Finally, save the processed data for further use. Create the output
# directory first so the writes don't fail on a fresh checkout.
os.makedirs('../data/processed', exist_ok=True)
train_data.to_csv('../data/processed/train_data.csv')
test_data.to_csv('../data/processed/test_data.csv')
train_normalized.to_csv('../data/processed/train_normalized.csv')
test_normalized.to_csv('../data/processed/test_normalized.csv')