# %% Import required packages
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# %% Load data
# Assume that we have a CSV file in the raw data folder
data = pd.read_csv('../data/raw/raw_data.csv')

# %% Display the first few rows of the data
# It's always a good idea to take a look at the data before starting preprocessing.
print(data.head())

# %% Handle missing values
# This will depend on your specific dataset. Here we'll simply remove rows with missing values.
data = data.dropna()

# %% Convert date column to datetime
# If your dataset includes a date column, it's a good idea to convert it to datetime format for time series analysis.
data['Date'] = pd.to_datetime(data['Date'])

# %% Set date as index
# For time series analysis, it can be useful to set the date column as the index of the DataFrame.
data = data.set_index('Date')

# %% Normalize data (optional)
# If your models require normalized data, you can use MinMaxScaler or another normalization technique.
scaler = MinMaxScaler()
data_normalized = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)

# %% Split data into training and testing sets
# It's important to split your data into training and testing sets to evaluate the performance of your models.
train_data, test_data = train_test_split(data, test_size=0.2, shuffle=False)

# %% Save processed data
# Finally, save your processed data for further use.
train_data.to_csv('../data/processed/train_data.csv')
test_data.to_csv('../data/processed/test_data.csv')
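
# %% Leakage-aware scaling (sketch)
# Note: the normalization cell above fits MinMaxScaler on the full dataset before the
# train/test split, which lets statistics from the test period influence the scaling.
# A common alternative, sketched here under the same assumptions as above (numeric
# columns only, train_data/test_data from the split), is to fit the scaler on the
# training split and reuse it to transform the test split.
train_scaler = MinMaxScaler()
train_scaled = pd.DataFrame(train_scaler.fit_transform(train_data),
                            columns=train_data.columns, index=train_data.index)
test_scaled = pd.DataFrame(train_scaler.transform(test_data),
                           columns=test_data.columns, index=test_data.index)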