Predicting Stock Prices with LSTMs¶

Setup¶

In [1]:

            
                Copied!
                
                    
                    
                
                

        
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

Dataset¶

CAC40¶

CAC40, previously known as Bourse de Paris, is a benchmark French stock market index. The index represents a capitalization-weighted measure of the 40 most significant stocks among the 100 largest market caps on the Euronext Paris. Its acronym stands for Cotation Assistée en Continu, which translates to continuous assisted trading, and is used as a benchmark index for funds investing in the French stock market.

Reference: https://www.kaggle.com/datasets/bryanb/cac40-stocks-dataset

In [2]:

            
                Copied!
                
df = pd.read_csv('preprocessed_CAC40.csv').drop(columns=['Unnamed: 0'])
df = pd.read_csv('preprocessed_CAC40.csv').drop(columns=['Unnamed: 0'])

In [3]:

            
                Copied!
                
df.shape
df.shape

Out[3]:

(97648, 7)

In [4]:

            
                Copied!
                
df.head()
df.head()

Out[4]:

	Name	Date	Open	Closing_Price	Daily_High	Daily_Low	Volume
0	Accor	2020-04-03	22.99	23.40	23.40	22.99	67
1	Accor	2020-04-02	23.91	22.99	23.91	22.99	250
2	Accor	2020-04-01	24.10	23.83	24.10	23.83	37
3	Accor	2020-03-31	25.04	25.00	25.24	24.99	336
4	Accor	2020-03-30	26.50	25.02	26.50	24.99	415

In [5]:

            
                Copied!
                
df.dtypes
df.dtypes

Out[5]:

Name              object
Date              object
Open             float64
Closing_Price    float64
Daily_High       float64
Daily_Low        float64
Volume            object
dtype: object

Change datatype of 'Date' to datetime

In [6]:

            
                Copied!
                
df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = pd.to_datetime(df['Date'])

In [7]:

            
                Copied!
                
df['Date'].min(), df['Date'].max()
df['Date'].min(), df['Date'].max()

Out[7]:

(Timestamp('2010-01-04 00:00:00'), Timestamp('2020-04-03 00:00:00'))

In [8]:

            
                Copied!
                
pd.to_datetime(df['Date'].max()) - pd.to_datetime(df['Date'].min())
pd.to_datetime(df['Date'].max()) - pd.to_datetime(df['Date'].min())

Out[8]:

Timedelta('3742 days 00:00:00')

In [9]:

            
                Copied!
                
df['Name'].unique().tolist()
df['Name'].unique().tolist()

Out[9]:

['Accor',
 'Air Liquide',
 'Airbus ',
 'ArcelorMittal',
 'Atos',
 'AXA',
 'BNP Paribas',
 'Bouygues',
 'Cap Gemini',
 'Crédit Agricole',
 'Danone',
 'Dassault Systèmes',
 'Engie (ex GDF Suez',
 'EssilorLuxottica',
 'Hermès (Hermes International',
 'Kering',
 'LEGRAND',
 'LOréal',
 'LVMH Moet Hennessy Louis Vuitton',
 'Michelin (Compagnie Générale d Etablissements Michelin SCPA',
 'Orange',
 'Pernod Ricard',
 'Peugeot',
 'Publicis',
 'Renault',
 'SAFRAN',
 'Saint-Gobain',
 'Sanofi',
 'Schneider Electric',
 'Société Générale (Societe Generale',
 'Sodexo',
 'STMicroelectronics',
 'TOTAL',
 'Unibail-Rodamco',
 'Veolia Environnement',
 'VINCI',
 'Vivendi',
 'Worldline SA']

Lets choose Renault for this analysis.

In [10]:

            
                Copied!
                
COMPANY = 'Renault'
COMPANY = 'Renault'

In [11]:

            
                Copied!
                
df = df[df['Name'] == COMPANY]
df = df[df['Name'] == COMPANY]

In [12]:

            
                Copied!
                
df.shape
df.shape

Out[12]:

(2600, 7)

In [13]:

            
                Copied!
                
                    
                    
                
                

        
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=df['Date'], y=df['Closing_Price'], name=f"{COMPANY} Closing Prices over Time", mode="lines"))
fig.update_layout(
    title=f"{COMPANY} Closing Prices over Time", xaxis_title="Year", yaxis_title="Closing Price"
)
fig.show()
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=df['Date'], y=df['Closing_Price'], name=f"{COMPANY} Closing Prices over Time", mode="lines"))
fig.update_layout(
    title=f"{COMPANY} Closing Prices over Time", xaxis_title="Year", yaxis_title="Closing Price"
)
fig.show()

Renault Closing Prices over Time

Pre-processing¶

Data Statistics

In [14]:

            
                Copied!
                
df.describe().transpose()
df.describe().transpose()

Out[14]:

	count	mean	std	min	25%	50%	75%	max
Open	2599.0	59.534467	20.709364	14.90	39.5850	59.700	77.3900	99.35
Closing_Price	2600.0	59.525573	20.751167	14.85	39.5375	59.580	77.4025	99.56
Daily_High	2600.0	60.142442	20.872593	15.80	39.8625	60.240	78.2525	100.02
Daily_Low	2600.0	58.908731	20.592854	12.86	39.1075	58.995	76.5650	98.23

Check for missing values

In [15]:

            
                Copied!
                
df.isna().sum()
df.isna().sum()

Out[15]:

Name              0
Date              0
Open              1
Closing_Price     0
Daily_High        0
Daily_Low         0
Volume           44
dtype: int64

In [16]:

            
                Copied!
                
df[df['Open'].isna()]
df[df['Open'].isna()]

Out[16]:

	Name	Date	Open	Closing_Price	Daily_High	Daily_Low	Volume
64336	Renault	2012-07-30	NaN	36.0	36.15	35.56	3,591

Apparently this one row has missing Open price, which we can ignore since we want to forecast the closing price.

Lets retain just the date and closing prices, since we only need these for forecasting.

In [17]:

            
                Copied!
                
df = df[['Date', 'Closing_Price']]
df = df[['Date', 'Closing_Price']]

Also, lets set Date as the dataframe index.

In [18]:

            
                Copied!
                
df = df.set_index('Date')
df = df.set_index('Date')

Split the dataset¶

Lets split the dataset into:

Train
Validation
Test

In [19]:

            
                Copied!
                
column_indices = {name: i for i, name in enumerate(df.columns)}

n = len(df)
train_df = df[: int(n*0.6)]
val_df = df[int(n*0.6): int(n*0.8)]
test_df = df[int(n*0.8): ]

# since we're only using closing prices
num_features = 1
column_indices = {name: i for i, name in enumerate(df.columns)}

n = len(df)
train_df = df[: int(n*0.6)]
val_df = df[int(n*0.6): int(n*0.8)]
test_df = df[int(n*0.8): ]

# since we're only using closing prices
num_features = 1

Standardize the dataset¶

In [20]:

            
                Copied!
                
scaler = StandardScaler()
scaler = StandardScaler()

In [21]:

            
                Copied!
                
# we fit just on the train set to avoid data leakage
train_df = scaler.fit_transform(train_df)

val_df = scaler.transform(val_df)
test_df = scaler.transform(test_df)
# we fit just on the train set to avoid data leakage
train_df = scaler.fit_transform(train_df)

val_df = scaler.transform(val_df)
test_df = scaler.transform(test_df)

In [22]:

            
                Copied!
                
train_df.shape, val_df.shape, test_df.shape
train_df.shape, val_df.shape, test_df.shape

Out[22]:

((1560, 1), (520, 1), (520, 1))

Data windowing¶

LSTMs accept a 3-dimensional input:

(number of samples, time lag, number of features)

Number of samples: This represents the total number instances in a batch that the LSTM will get trained on, think of it as the number of rows.
Time lag: This is a very LSTM specific dimension. With LSTMs we are predicting future values of a feature using its own past values. Time lag represents the interval that the LSTM will look-back of the feature to predict the next future value.
Number of features: This represents the number of features for which we are forecasting, just closing prices in our case.

Given that our train, validation and test sets are 2 dimensional currently, we need a way to convert them into 3 dimensional so that LSTMs can consume them.

This is where data windowing comes in.

For our use-case, lets assume that the last 15 days can be used to predict the next day, thus time lag will be 15.

In [25]:

            
                Copied!
                
time_lag = 15
time_lag = 15

In [23]:

            
                Copied!
                
def make_lstm_compatible(X, y, time_lag):
    X_temp, y_temp = [], []
    for i in range(len(X) - time_lag):
        X_temp.append(X[i: (i + time_lag)])
        y_temp.append(y[i + time_lag])

    return np.array(X_temp), np.array(y_temp)
def make_lstm_compatible(X, y, time_lag):
    X_temp, y_temp = [], []
    for i in range(len(X) - time_lag):
        X_temp.append(X[i: (i + time_lag)])
        y_temp.append(y[i + time_lag])

    return np.array(X_temp), np.array(y_temp)

In [26]:

            
                Copied!
                
X_train, y_train = make_lstm_compatible(train_df, train_df, time_lag)
X_val, y_val = make_lstm_compatible(val_df, val_df, time_lag)
X_train, y_train = make_lstm_compatible(train_df, train_df, time_lag)
X_val, y_val = make_lstm_compatible(val_df, val_df, time_lag)

In [27]:

            
                Copied!
                
X_train.shape, y_train.shape
X_train.shape, y_train.shape

Out[27]:

((1545, 15, 1), (1545, 1))

Build Model¶

In [28]:

            
                Copied!
                
def build_model():
    model = Sequential()
    
    model.add(LSTM(units = 50, return_sequences = True, input_shape = (X_train.shape[1], 1)))
    model.add(Dropout(0.2))

    model.add(LSTM(units = 50, return_sequences = True))
    model.add(Dropout(0.2))

    model.add(LSTM(units = 50))
    model.add(Dropout(0.2))
    
    model.add(Dense(units=1))

    model.compile(loss='mean_squared_error',
                optimizer='adam')
    
    return model
def build_model():
    model = Sequential()
    
    model.add(LSTM(units = 50, return_sequences = True, input_shape = (X_train.shape[1], 1)))
    model.add(Dropout(0.2))

    model.add(LSTM(units = 50, return_sequences = True))
    model.add(Dropout(0.2))

    model.add(LSTM(units = 50))
    model.add(Dropout(0.2))
    
    model.add(Dense(units=1))

    model.compile(loss='mean_squared_error',
                optimizer='adam')
    
    return model

In [29]:

            
                Copied!
                
model = build_model()
model = build_model()

2022-11-29 16:28:55.302477: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-29 16:28:55.302842: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

In [30]:

            
                Copied!
                
model.summary()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm (LSTM)                 (None, 15, 50)            10400     
                                                                 
 dropout (Dropout)           (None, 15, 50)            0         
                                                                 
 lstm_1 (LSTM)               (None, 15, 50)            20200     
                                                                 
 dropout_1 (Dropout)         (None, 15, 50)            0         
                                                                 
 lstm_2 (LSTM)               (None, 50)                20200     
                                                                 
 dropout_2 (Dropout)         (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
=================================================================
Total params: 50,851
Trainable params: 50,851
Non-trainable params: 0
_________________________________________________________________

In [32]:

            
                Copied!
                
                    
                    
                
                

        
model.fit(X_train, 
          y_train, 
          validation_data=(X_val, y_val),
          epochs=25, 
          batch_size = 32,
          verbose=1
)
model.fit(X_train, 
          y_train, 
          validation_data=(X_val, y_val),
          epochs=25, 
          batch_size = 32,
          verbose=1
)

Epoch 1/25
49/49 [==============================] - ETA: 0s - loss: 0.0128

2022-11-29 16:30:52.618603: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-11-29 16:30:52.747079: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-11-29 16:30:52.794913: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-11-29 16:30:52.848000: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.

49/49 [==============================] - 2s 47ms/step - loss: 0.0128 - val_loss: 0.0115
Epoch 2/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0128 - val_loss: 0.0123
Epoch 3/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0122 - val_loss: 0.0062
Epoch 4/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0122 - val_loss: 0.0169
Epoch 5/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0116 - val_loss: 0.0054
Epoch 6/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0122 - val_loss: 0.0133
Epoch 7/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0114 - val_loss: 0.0052
Epoch 8/25
49/49 [==============================] - 1s 29ms/step - loss: 0.0112 - val_loss: 0.0068
Epoch 9/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0109 - val_loss: 0.0110
Epoch 10/25
49/49 [==============================] - 2s 31ms/step - loss: 0.0118 - val_loss: 0.0071
Epoch 11/25
49/49 [==============================] - 1s 29ms/step - loss: 0.0107 - val_loss: 0.0083
Epoch 12/25
49/49 [==============================] - 2s 33ms/step - loss: 0.0105 - val_loss: 0.0052
Epoch 13/25
49/49 [==============================] - 1s 30ms/step - loss: 0.0104 - val_loss: 0.0070
Epoch 14/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0108 - val_loss: 0.0051
Epoch 15/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0114 - val_loss: 0.0124
Epoch 16/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0105 - val_loss: 0.0063
Epoch 17/25
49/49 [==============================] - 1s 30ms/step - loss: 0.0112 - val_loss: 0.0094
Epoch 18/25
49/49 [==============================] - 1s 29ms/step - loss: 0.0111 - val_loss: 0.0049
Epoch 19/25
49/49 [==============================] - 1s 30ms/step - loss: 0.0112 - val_loss: 0.0047
Epoch 20/25
49/49 [==============================] - 1s 29ms/step - loss: 0.0106 - val_loss: 0.0072
Epoch 21/25
49/49 [==============================] - 1s 29ms/step - loss: 0.0104 - val_loss: 0.0048
Epoch 22/25
49/49 [==============================] - 1s 29ms/step - loss: 0.0104 - val_loss: 0.0096
Epoch 23/25
49/49 [==============================] - 1s 29ms/step - loss: 0.0101 - val_loss: 0.0064
Epoch 24/25
49/49 [==============================] - 1s 28ms/step - loss: 0.0100 - val_loss: 0.0057
Epoch 25/25
49/49 [==============================] - 1s 29ms/step - loss: 0.0105 - val_loss: 0.0101

Out[32]:

<keras.callbacks.History at 0x2c904c940>

Model Performance¶

Validation Set¶

In [33]:

            
                Copied!
                
val_pred = model.predict(X_val)
val_pred = model.predict(X_val)

2022-11-29 16:32:11.191235: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-11-29 16:32:11.312565: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-11-29 16:32:11.370563: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.

11/16 [===================>..........] - ETA: 0s

2022-11-29 16:32:11.432014: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.

16/16 [==============================] - 1s 15ms/step

In [34]:

            
                Copied!
                
# inverse transform to get actual closing prices
val_pred = scaler.inverse_transform(val_pred)
# inverse transform to get actual closing prices
val_pred = scaler.inverse_transform(val_pred)

In [35]:

            
                Copied!
                
val_pred_df = pd.DataFrame({'timestamp': df[int(n*0.6)+time_lag: int(n*0.8)].index, 'Pred': val_pred[:, 0]})
y_val_df = pd.DataFrame({'timestamp': df[int(n*0.6)+time_lag: int(n*0.8)].index, 'Actual': scaler.inverse_transform(y_val)[:, 0]})
val_pred_df = pd.DataFrame({'timestamp': df[int(n*0.6)+time_lag: int(n*0.8)].index, 'Pred': val_pred[:, 0]})
y_val_df = pd.DataFrame({'timestamp': df[int(n*0.6)+time_lag: int(n*0.8)].index, 'Actual': scaler.inverse_transform(y_val)[:, 0]})

In [36]:

            
                Copied!
                
                    
                    
                
                

        
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=val_pred_df['timestamp'], y=val_pred_df['Pred'], name="Predicted", mode="lines"))
fig.add_trace(go.Scatter(x=y_val_df['timestamp'], y=y_val_df['Actual'], name="Actual", mode="lines"))
fig.update_layout(
    title="Predicted Vs. Actual (Validation Set)", xaxis_title="Date", yaxis_title=f"{COMPANY} Closing Price"
)
fig.show()
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=val_pred_df['timestamp'], y=val_pred_df['Pred'], name="Predicted", mode="lines"))
fig.add_trace(go.Scatter(x=y_val_df['timestamp'], y=y_val_df['Actual'], name="Actual", mode="lines"))
fig.update_layout(
    title="Predicted Vs. Actual (Validation Set)", xaxis_title="Date", yaxis_title=f"{COMPANY} Closing Price"
)
fig.show()

Predicted Vs. Actual (Validation Set)

Looks pretty damn inline with actual values! 🤩

Lets take a look at RMSE next.

In [37]:

            
                Copied!
                
print('RMSE: ', np.sqrt(mean_squared_error(val_pred_df['Pred'], y_val_df['Actual'])))
print('RMSE: ', np.sqrt(mean_squared_error(val_pred_df['Pred'], y_val_df['Actual'])))

RMSE:  1.5987654314071114

RMSE looks decent too!

Test Set¶

Moment of truth...

In [38]:

            
                Copied!
                
X_test, y_test = make_lstm_compatible(test_df, test_df, time_lag)
X_test, y_test = make_lstm_compatible(test_df, test_df, time_lag)

In [39]:

            
                Copied!
                
test_pred = model.predict(X_test)
test_pred = model.predict(X_test)

16/16 [==============================] - 0s 11ms/step

In [40]:

            
                Copied!
                
# inverse transform to get actual closing prices
test_pred = scaler.inverse_transform(test_pred)
# inverse transform to get actual closing prices
test_pred = scaler.inverse_transform(test_pred)

In [41]:

            
                Copied!
                
test_pred_df = pd.DataFrame({'timestamp': df[int(n*0.8)+time_lag: ].index, 'Pred': test_pred[:, 0]})
y_test_df = pd.DataFrame({'timestamp': df[int(n*0.8)+time_lag: ].index, 'Actual': scaler.inverse_transform(y_test)[:, 0]})
test_pred_df = pd.DataFrame({'timestamp': df[int(n*0.8)+time_lag: ].index, 'Pred': test_pred[:, 0]})
y_test_df = pd.DataFrame({'timestamp': df[int(n*0.8)+time_lag: ].index, 'Actual': scaler.inverse_transform(y_test)[:, 0]})

In [42]:

            
                Copied!
                
                    
                    
                
                

        
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=test_pred_df['timestamp'], y=test_pred_df['Pred'], name="Predicted", mode="lines"))
fig.add_trace(go.Scatter(x=y_test_df['timestamp'], y=y_test_df['Actual'], name="Actual", mode="lines"))
fig.update_layout(
    title="Predicted Vs. Actual (Test Set)", xaxis_title="Date", yaxis_title=f"{COMPANY} Closing Price"
)
fig.show()
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=test_pred_df['timestamp'], y=test_pred_df['Pred'], name="Predicted", mode="lines"))
fig.add_trace(go.Scatter(x=y_test_df['timestamp'], y=y_test_df['Actual'], name="Actual", mode="lines"))
fig.update_layout(
    title="Predicted Vs. Actual (Test Set)", xaxis_title="Date", yaxis_title=f"{COMPANY} Closing Price"
)
fig.show()

Predicted Vs. Actual (Test Set)

This looks good, even on unseen data!

In [43]:

            
                Copied!
                
print('RMSE: ', np.sqrt(mean_squared_error(test_pred_df['Pred'], y_test_df['Actual'])))
print('RMSE: ', np.sqrt(mean_squared_error(test_pred_df['Pred'], y_test_df['Actual'])))

RMSE:  1.9242710045738585

RMSE is decent too!