Skip to content

Latest commit



695 lines (575 loc) · 13.5 KB

File metadata and controls

695 lines (575 loc) · 13.5 KB
import numpy as np
import pandas as pd
from pathlib import Path
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import warnings

Regression Analysis: Seasonal Effects with Sklearn Linear Regression

In this notebook, you will build a scikit-learn linear regression model that predicts Yen futures ("Settle") returns from lagged Yen futures returns.

# Futures contract on the Yen-dollar exchange rate:
# This is the continuous chain of the futures contracts that are 1 month to expiration.
# The "Date" column becomes a parsed datetime index so the frame can be sliced by date strings.
yen_futures = pd.read_csv(
    Path("yen.csv"),
    index_col="Date",
    parse_dates=True,
)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;

.dataframe thead th {
    text-align: right;
Open High Low Last Change Settle Volume Previous Day Open Interest
1976-08-02 3398.0 3401.0 3398.0 3401.0 NaN 3401.0 2.0 1.0
1976-08-03 3401.0 3401.0 3401.0 3401.0 NaN 3401.0 0.0 1.0
1976-08-04 3401.0 3401.0 3401.0 3401.0 NaN 3401.0 0.0 1.0
1976-08-05 3401.0 3401.0 3401.0 3401.0 NaN 3401.0 0.0 1.0
1976-08-06 3401.0 3401.0 3401.0 3401.0 NaN 3401.0 0.0 1.0
# Trim the dataset to begin on January 1st, 1990.
# Take an explicit copy: columns are added to this frame later, and assigning
# into a slice of the original frame raises pandas' SettingWithCopyWarning.
yen_futures = yen_futures.loc["1990-01-01":, :].copy()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;

.dataframe thead th {
    text-align: right;
Open High Low Last Change Settle Volume Previous Day Open Interest
1990-01-02 6954.0 6954.0 6835.0 6847.0 NaN 6847.0 48336.0 51473.0
1990-01-03 6877.0 6910.0 6865.0 6887.0 NaN 6887.0 38206.0 53860.0
1990-01-04 6937.0 7030.0 6924.0 7008.0 NaN 7008.0 49649.0 55699.0
1990-01-05 6952.0 6985.0 6942.0 6950.0 NaN 6950.0 29944.0 53111.0
1990-01-08 6936.0 6972.0 6936.0 6959.0 NaN 6959.0 19763.0 52072.0

Data Preparation


# Create a series of "Settle" price percentage returns (pct_change() scaled by 100).
settle_pct_change = yen_futures["Settle"].pct_change() * 100

# pct_change() can yield both +inf and -inf; map both to NaN so that dropna()
# removes them together with the leading NaN of the first observation.
returns = settle_pct_change.replace([np.inf, -np.inf], np.nan).dropna()

# Assignment aligns on the date index, so dates dropped above end up as NaN
# in the new "Returns" column.
yen_futures["Returns"] = returns

<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;

.dataframe thead th {
    text-align: right;
Open High Low Last Change Settle Volume Previous Day Open Interest Returns
2019-10-09 9381.0 9391.5 9330.5 9343.5 38.5 9338.0 99153.0 145470.0 -0.410601
2019-10-10 9343.5 9380.5 9293.5 9301.0 34.5 9303.5 159397.0 144474.0 -0.369458
2019-10-11 9308.5 9309.0 9240.0 9267.0 52.5 9251.0 158810.0 147471.0 -0.564304
2019-10-14 9259.0 9292.0 9250.5 9261.0 14.0 9265.0 69457.0 153902.0 0.151335
2019-10-15 9264.5 9280.0 9216.5 9220.0 43.5 9221.5 108342.0 151564.0 -0.469509

Lagged Returns

# Lagged return: the previous row's return, obtained by shifting "Returns" down one period.
yen_futures["Lagged_Return"] = yen_futures["Returns"].shift(1)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;

.dataframe thead th {
    text-align: right;
Open High Low Last Change Settle Volume Previous Day Open Interest Returns Lagged_Return
2019-10-09 9381.0 9391.5 9330.5 9343.5 38.5 9338.0 99153.0 145470.0 -0.410601 0.170931
2019-10-10 9343.5 9380.5 9293.5 9301.0 34.5 9303.5 159397.0 144474.0 -0.369458 -0.410601
2019-10-11 9308.5 9309.0 9240.0 9267.0 52.5 9251.0 158810.0 147471.0 -0.564304 -0.369458
2019-10-14 9259.0 9292.0 9250.5 9261.0 14.0 9265.0 69457.0 153902.0 0.151335 -0.564304
2019-10-15 9264.5 9280.0 9216.5 9220.0 43.5 9221.5 108342.0 151564.0 -0.469509 0.151335

Train Test Split

# Create a train/test split for the data using 2018-2019 for testing and the rest for training.
# Rows without a return or a lagged return (the first observations, at minimum) are
# removed first, because LinearRegression cannot be fitted on NaN values.
# NOTE(review): only the two modelling columns are checked here; the sample output
# shown in the notebook starts in 2014, which suggests the original run used a
# blanket dropna() over every column - confirm which is intended.
model_data = yen_futures.dropna(subset=["Returns", "Lagged_Return"])

train = model_data.loc[:"2017"]
test = model_data.loc["2018":]

# Create four objects:
# X_train (training set using just the independent variable), X_test (test set of just the independent variable)
# y_train (training set using just the "y" variable, i.e., "Returns"), y_test (test set of just the "y" variable):
X_train = train["Lagged_Return"].to_frame()
X_test = test["Lagged_Return"].to_frame()

y_train = train["Returns"]
y_test = test["Returns"]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;

.dataframe thead th {
    text-align: right;
2014-02-18 0.409123
2014-02-19 -0.427829
2014-02-20 -0.020460
2014-02-21 0.020465
2014-02-24 -0.204604

Linear Regression Model

# Create a Linear Regression model and fit it to the training data
model = LinearRegression()

# Fit the scikit-learn linear regression using just the training set (X_train, y_train).
# Without this call the later model.predict(...) calls fail on an unfitted estimator.
model.fit(X_train, y_train)

Make predictions using the Testing Data

Note: We want to evaluate the model using data that it has never seen before, in this case: X_test.

# Make a prediction of "y" values using just the test dataset
predictions = model.predict(X_test)

# Assemble actual y data (y_test) with predicted y data (from just above) into two columns in a dataframe:
Results = y_test.to_frame()
Results["Predicted Returns"] = predictions

# Plot the first 20 predictions vs the true values, one subplot per column
Results.iloc[:20].plot(subplots=True)
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f97c2942a10>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f97c2968a10>],


Out-of-Sample Performance

Evaluate the model using "out-of-sample" data (X_test and y_test)

# Calculate the mean_squared_error (MSE) on actual versus predicted test "y"
mse = mean_squared_error(Results["Returns"], Results["Predicted Returns"])

# Using that mean-squared-error, calculate the root-mean-squared error (RMSE).
# RMSE is in the same units as the returns themselves (percent).
rmse = np.sqrt(mse)
print(f"Out-of-Sample Performance Root Mean Squared Error (RMSE): {rmse}")
Out-of-Sample Performance Root Mean Squared Error (RMSE): 0.41545437184712763

In-Sample Performance

Evaluate the model using in-sample data (X_train and y_train)

# Construct a dataframe using just the "y" training data:
in_sample_results = y_train.to_frame()

# Add a column of "in-sample" predictions to that dataframe:
in_sample_results["In-sample Predictions"] = model.predict(X_train)

# Calculate in-sample mean_squared_error (for comparison to out-of-sample)
in_sample_mse = mean_squared_error(
    in_sample_results["Returns"],
    in_sample_results["In-sample Predictions"],
)

# Calculate in-sample root mean_squared_error (for comparison to out-of-sample)
in_sample_rmse = np.sqrt(in_sample_mse)
print(f"In-Sample Performance Root Mean Squared Error (RMSE): {in_sample_rmse}")
In-of-Sample Performance Root Mean Squared Error (RMSE): 0.5962037920929946



# Compare out-of-sample and in-sample error. The wording is derived from the
# actual values instead of being hard-coded, and the conclusion is about how
# well the model generalizes (a lower RMSE means smaller prediction errors).
if rmse < in_sample_rmse:
    relation = "is lower than"
    conclusion = "so the model predicts unseen data at least as well as it fits the training data"
else:
    relation = "is not lower than"
    conclusion = "so the model fits the training data at least as well as it predicts unseen data"
print(f"Out-of-Sample Performance Root Mean Squared Error (RMSE): {rmse} {relation} In-Sample Performance Root Mean Squared Error (RMSE): {in_sample_rmse} {conclusion}")
Out-of-Sample Performance Root Mean Squared Error (RMSE): 0.41545437184712763 is lower than In-of-Sample Performance Root Mean Squared Error (RMSE): 0.5962037920929946 so Out-of-Sample data are more significant