Business Case - Price Forecasting - Autoregressive Integrated Moving Average Model (ARIMA)
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA
# Helper that turns a 'Month' label (text such as '2019-01') into a datetime
def dateparse(x):
    """Parse a ``'%Y-%m'`` formatted string into a ``datetime``.

    The format carries no day or time component, so ``strptime`` fills in
    the first day of the month at midnight.
    """
    month_format = '%Y-%m'
    parsed = datetime.strptime(x, month_format)
    return parsed
# import data from csv
# The first column holds the month as 'YYYY-MM' text and becomes the index.
# It is converted with an explicit format after loading instead of through
# read_csv's `date_parser=` argument, which is deprecated since pandas 2.0.
price = pd.read_csv('arima-sample-data-csv.csv', index_col=0)
price.index = pd.to_datetime(price.index, format='%Y-%m')
# Notebook preview of the first ten rows (no effect when run as a script).
price.head(10)
| Month | Price |
|---|---|
| 2019-01-01 | 270 |
| 2019-02-01 | 150 |
| 2019-03-01 | 182 |
| 2019-04-01 | 120 |
| 2019-05-01 | 177 |
| 2019-06-01 | 165 |
| 2019-07-01 | 250 |
| 2019-08-01 | 244 |
| 2019-09-01 | 194 |
| 2019-10-01 | 133 |
# draw a graph of the series with its rolling statistics using matplotlib
def test_stationarity(data, window=12):
    """Plot ``data`` together with its rolling mean and rolling std.

    This is a visual aid only: no statistical test is computed and nothing
    is returned. The figure is shown without blocking.

    Args:
        data: object exposing ``.rolling(window=...)`` whose result has
            ``.mean()`` and ``.std()`` (the script passes the price
            DataFrame).
        window: number of observations in each rolling window. Defaults to
            12, the value that used to be hard-coded.
    """
    # Rolling statistics; kept under their own names so they are not
    # overwritten by the plot handles returned below.
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()

    # Plot the original series and both rolling statistics on one figure.
    plt.plot(data, label='Original')
    plt.plot(rolling_mean, color='red', label='Rolling Mean')
    plt.plot(rolling_std, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.ylabel('Purchase Price', fontsize=11)
    plt.title('Concrete Purchase Price 2019-2021')
    plt.xticks(fontsize=8)
    plt.show(block=False)
# Visual check of the raw price series and its rolling statistics.
test_stationarity(price)

# First-order differencing: replace each value with its change from the
# previous period, to turn the non-stationary series into a stationary one.
price_diff = price.diff()

# Draw the differenced series and decorate the axes it was drawn on.
diff_axes = price_diff.plot()
diff_axes.set_ylabel('Purchase Price after Differencing', fontsize=11)
diff_axes.set_title('Concrete Purchase Price after Differencing 2019-2021')
leg = diff_axes.legend(loc='upper left')
plt.show()

# Once differenced, the mean and the other statistical properties settle to
# a constant level, so the ARIMA parameter d is taken to be 1.
# split the testing and training data chronologically
# (first 80% = train, last 20% = test)
# shuffle=False keeps the observations in time order. ARIMA models each value
# from the values that precede it, so a randomly shuffled training set has no
# meaningful ordering, and a forecast continuing from the end of x_train can
# only be compared with x_test when x_test is the period that follows it.
x = price.values
x_train, x_test = train_test_split(x, test_size=0.2, shuffle=False)
# pick the ARIMA model with lowest AIC - choose the parameter p, d, q
import itertools
import warnings

# Fitting warnings are silenced for the rest of the session, as before.
warnings.filterwarnings('ignore')

# Candidate orders: p in 10..12, d fixed at 1, q in 0..1.
p = range(10, 13)
q = range(0, 2)
d = range(1, 2)
pdq = list(itertools.product(p, d, q))

# Start from +inf so that any finite AIC is accepted, however large, and
# use None to mark "no order could be fitted".
smallest_aic = float('inf')
result_pdq = None
for parameter in pdq:
    try:
        model_arima = ARIMA(x_train, order=parameter)
        model_arima_fit = model_arima.fit()
    except Exception as err:
        # A single order failing to fit should not stop the search, but the
        # failure is reported instead of being silently discarded.
        print('Skipping order {}: {}'.format(parameter, err))
        continue
    if model_arima_fit.aic < smallest_aic:
        smallest_aic = model_arima_fit.aic
        result_pdq = parameter

if result_pdq is None:
    # Fail here with a clear message rather than later with an invalid order.
    raise RuntimeError('No candidate ARIMA order could be fitted.')
print('These are parameters p,d,q with the lowest AIC:{}.'.format(result_pdq))
These are parameters p,d,q with the lowest AIC:(11, 1, 1).
# Refit the order selected above on the training data and forecast ten steps
model_arima = ARIMA(x_train, order=result_pdq)
model_arima_fit = model_arima.fit()
x_pred = model_arima_fit.forecast(steps=10)

# Held-out values and forecast drawn on the same axes, y-range fixed to 0-1000
plt.plot(x_test, label='Actual Price')
plt.plot(x_pred, color='red', label='Predicted Price')
plt.ylim(0, 1000)

# Titles and axis labels
plt.title('Price Forecasting for Next Ten Time Periods')
plt.xlabel('Next Ten Time Periods', fontsize=11)
plt.ylabel('Price', fontsize=11)
leg = plt.legend(loc='lower left')
plt.show()
# Report for the refitted model: its AIC and the ten forecast values.
# Each tuple is printed as one line, fields separated by a single space.
summary_rows = [
    ('Function Related Information',),
    ('AIC Number with Most Fitted ARIMA Model', model_arima_fit.aic),
    ('-------------------------------------------------------------',),
    ('Predicted Price for the Next Ten Time Period', x_pred),
]
for row in summary_rows:
    print(*row)
Function Related Information AIC Number with Most Fitted ARIMA Model 338.47178354139584 ------------------------------------------------------------- Predicted Price for the Next Ten Time Period [357.03912331 539.09632475 320.97851364 369.61425201 265.9263721 152.21175181 426.33373751 359.75924127 363.63837398 431.91655858]