#################################################################################
##############        Forecasting:     TF         ###############################
##############     ----------- solution ---------    ############################
#################################################################################


library(MLTools)
##    __    __   _            _________
##   |  \  /  | | |          |___   ___| _____   _____   _      _____
##   |   \/   | | |     ____     | |    |  _  | |  _  | | |    |  ___|
##   | |\  /| | | |    |____|    | |    | | | | | | | | | |    |___  |
##   | | \/ | | | |____          | |    | |_| | | |_| | | |__   ___| |
##   |_|    |_| |______|         |_|    |_____| |_____| |____| |_____|
## 
##            Learning is fun, Machine Learning is funnier
##             -----------------------------------------
##            With great power comes great responsibility
##             -----------------------------------------
## Created by: José Portela González <Jose.Portela@iit.comillas.edu>,
## Guillermo Mestre Marcos <Guillermo.Mestre@comillas.edu>, Jaime Pizarroso Gonzalo
## <jpizarroso@comillas.edu>, Antonio Muñoz San Roque
## <antonio.munoz@iit.comillas.edu>
## 
## Escuela Técnica Superior de Ingeniería ICAI
library(fpp2)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ ggplot2   3.5.2      ✔ fma       2.5   
## ✔ forecast  8.24.0     ✔ expsmooth 2.3
library(ggplot2)
library(TSA)
## Registered S3 methods overwritten by 'TSA':
##   method       from    
##   fitted.Arima forecast
##   plot.Arima   forecast
## 
## Attaching package: 'TSA'
## 
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## 
## The following object is masked from 'package:utils':
## 
##     tar
library(lmtest)  #contains coeftest function
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tseries) #contains adf.test function
library(Hmisc) # for computing lagged variables
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Set working directory ---------------------------------------------------------------------------------------------
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# # Code to import data from Python statsmodels via reticulate
# library(reticulate)
# library(tidyverse)
# use_condaenv("fc", required = TRUE)
# 
# # Import the statsmodels module
# sm <- import("statsmodels.api")
# # Load the dataset into a pandas DataFrame
# macro_data <- py_to_r(sm$datasets$macrodata$load_pandas()$data)
# macro_data <- macro_data %>% 
#   select(year, quarter, realgdp, realcons, unemp)
# 
# write.csv(macro_data, file = "macro_data.csv", row.names = FALSE)



## Load dataset -------------------------------------------------------------------------------------------------------
fdata <- read.csv("macro_data.csv")

# View first few rows
head(fdata)
##   year quarter  realgdp realcons unemp
## 1 1959       1 2710.349   1707.4   5.8
## 2 1959       2 2778.801   1733.7   5.1
## 3 1959       3 2775.488   1751.8   5.3
## 4 1959       4 2785.204   1753.7   5.6
## 5 1960       1 2847.699   1770.5   5.2
## 6 1960       2 2834.390   1792.9   5.2
# Convert to time series object
freq <- 4
fdata_ts <- ts(fdata[,-(1:2)], frequency = freq)

head(fdata_ts)
##       realgdp realcons unemp
## 1 Q1 2710.349   1707.4   5.8
## 1 Q2 2778.801   1733.7   5.1
## 1 Q3 2775.488   1751.8   5.3
## 1 Q4 2785.204   1753.7   5.6
## 2 Q1 2847.699   1770.5   5.2
## 2 Q2 2834.390   1792.9   5.2
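# Because `start` was not supplied, the time index begins at year 1 rather than
# the calendar year. A sketch that keeps calendar dates on the axis (assuming
# the sample starts in 1959 Q1, as head(fdata) above shows):
# fdata_ts <- ts(fdata[,-(1:2)], start = c(1959, 1), frequency = freq)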
autoplot(fdata_ts, facets = TRUE)

ggAcf(fdata_ts[,1], lag.max = 15 * freq)

# Scale factors: dividing realgdp and realcons by 1000 puts the three series
# on a similar order of magnitude
scale_y <- 1000
scale_x <- c(1000,1)

# Create time series and scale values 
y <- fdata_ts[ ,1]/scale_y
x1 <- fdata_ts[ , 2]/scale_x[1]
x2 <- fdata_ts[ , 3]/scale_x[2]


x <- cbind(x1, x2)
fdata_ts <- cbind(y, x1, x2)
head(fdata_ts)
##             y     x1  x2
## 1 Q1 2.710349 1.7074 5.8
## 1 Q2 2.778801 1.7337 5.1
## 1 Q3 2.775488 1.7518 5.3
## 1 Q4 2.785204 1.7537 5.6
## 2 Q1 2.847699 1.7705 5.2
## 2 Q2 2.834390 1.7929 5.2
autoplot(fdata_ts, facets = TRUE)

y_orig <- fdata_ts[,1]

# Length of training and validation subsets
len_TV <- 12
len_TR <- nrow(fdata_ts) - len_TV

# Use it to create train and test subsets
ind <- seq_len(nrow(fdata))
ind_TR <- ind <= len_TR
ind_TV <- !ind_TR

# Training subset
y.tr <- ts(y[ind_TR], frequency = freq)
x.tr <- ts(x[ind_TR,], frequency = freq)

autoplot(cbind(y.tr,x.tr), facets = TRUE)

# Test subset

y.ts <- ts(y[ind_TV], frequency = freq) 
# Careful: we did not pass start!
head(y.ts)
##       Qtr1     Qtr2     Qtr3     Qtr4
## 1 13.06068 13.09990 13.20398 13.32111
## 2 13.39125 13.36687
y.ts <- ts(y[ind_TV], frequency = freq,
           start = end(y.tr) + c(0,1)) 
head(y.ts)
##        Qtr1     Qtr2     Qtr3     Qtr4
## 48                            13.06068
## 49 13.09990 13.20398 13.32111 13.39125
## 50 13.36687
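# end(y.tr) returns the last time point as c(period, season), so adding
# c(0, 1) starts the test set one quarter after training ends. ts() normalizes
# an out-of-range season (e.g. start = c(48, 5) becomes 49 Q1), so this also
# works when the training set ends in Q4.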
# Sometimes the test outputs are not available;
# here we assume the test inputs are known
x.ts <- ts(x[ind_TV,], frequency = freq,
           start = end(x.tr) + c(0,1))
head(x.ts)
##           x1  x2
## 48 Q4 9.1816 4.4
## 49 Q1 9.2651 4.5
## 49 Q2 9.2915 4.5
## 49 Q3 9.3356 4.7
## 49 Q4 9.3636 4.8
## 50 Q1 9.3496 4.9
autoplot(tail(y.tr, 50), series = "Training", color="blue") + 
  forecast::autolayer(y.ts, series = "Test", color = "red") + 
  geom_vline(xintercept = time(tail(y.tr, 10)), linetype="dashed", linewidth=0.5)

# Model identification 

ggtsdisplay(y.tr, lag.max = 50)
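# The ACF decays slowly and almost linearly over many lags, the classic sign
# that regular differencing will be needed (confirmed with the TF fits below).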

# BoxCox.lambda.plot(y.tr, window = freq)
BoxCox.lambda.plot(y.tr)
## `geom_smooth()` using formula = 'y ~ x'

##    log.ma 
## 0.2876229
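# The estimated lambda (~0.29) sits between a log (0) and a square-root (0.5)
# transformation; we nonetheless proceed with the untransformed series here.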
## Identification and fitting process -------------------------------------------------------------------------------------------------------
# (The intermediate steps are shown below for teaching purposes)


## FIRST --------------------------------------------
#### Fit an initial TF model with a large s
# This arima function is TSA's (it masks stats::arima). Each element of
# `transfer` gives the (r, s) orders for one input: c(0,9) sets r = 0 with a
# lag polynomial of order s = 9, long enough to identify the impulse response.
TF.fit <- arima(y.tr,
                xtransf = x.tr,
                order=c(1,0,0),
                # seasonal = list(order=c(1,0,0), period=freq),
                transfer = list(c(0,9),c(0,9)),
                method="ML")

TF.RegressionError.plot(y.tr,x.tr,TF.fit,lag.max = 50)
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Regular differencing is needed; until it is applied, the coefficients of the
#explanatory variables cannot be interpreted

## SECOND --------------------------------------------
#### Fit the TF model with a large s, now including regular differencing
TF.fit <- arima(y.tr,
                xtransf = x.tr,
                order=c(1,1,0),
                # seasonal = list(order=c(1,0,0),period=freq),
                transfer = list(c(0,9),c(0,9)),
                method="ML")
summary(TF.fit) #summary of training errors and estimated coefficients
## 
## Call:
## arima(x = y.tr, order = c(1, 1, 0), method = "ML", xtransf = x.tr, transfer = list(c(0, 
##     9), c(0, 9)))
## 
## Coefficients:
##           ar1  x1-MA0  x1-MA1  x1-MA2   x1-MA3  x1-MA4   x1-MA5   x1-MA6
##       -0.2448  0.9126  0.3677  0.1395  -0.1883  0.0586  -0.1506  -0.0461
## s.e.   0.0733  0.1025  0.1119  0.1125   0.1149  0.1156   0.1152   0.1157
##        x1-MA7  x1-MA8  x1-MA9   x2-MA0   x2-MA1  x2-MA2   x2-MA3  x2-MA4
##       -0.0619  0.1785  0.1090  -0.0545  -0.0117  0.0149  -0.0022  0.0012
## s.e.   0.1157  0.1159  0.1113   0.0124   0.0158  0.0155   0.0153  0.0153
##        x2-MA5  x2-MA6  x2-MA7  x2-MA8   x2-MA9
##       -0.0127  0.0025  0.0026  0.0095  -0.0056
## s.e.   0.0152  0.0150  0.0150  0.0148   0.0103
## 
## sigma^2 estimated as 0.001092:  log likelihood = 360.36,  aic = -678.72
## 
## Training set error measures:
##                     ME       RMSE        MAE        MPE      MAPE      MASE
## Training set 0.0030248 0.03294917 0.02551225 0.06547611 0.3936668 0.4036532
##                     ACF1
## Training set -0.02925025
coeftest(TF.fit) #statistical significance of estimated coefficients
## 
## z test of coefficients:
## 
##          Estimate Std. Error z value  Pr(>|z|)    
## ar1    -0.2448429  0.0733336 -3.3388 0.0008415 ***
## x1-MA0  0.9126065  0.1025226  8.9015 < 2.2e-16 ***
## x1-MA1  0.3676982  0.1119134  3.2856 0.0010178 ** 
## x1-MA2  0.1394587  0.1124597  1.2401 0.2149465    
## x1-MA3 -0.1883219  0.1149311 -1.6386 0.1013043    
## x1-MA4  0.0586236  0.1155526  0.5073 0.6119217    
## x1-MA5 -0.1506487  0.1152302 -1.3074 0.1910866    
## x1-MA6 -0.0461404  0.1157331 -0.3987 0.6901295    
## x1-MA7 -0.0619178  0.1157350 -0.5350 0.5926523    
## x1-MA8  0.1784654  0.1159314  1.5394 0.1237053    
## x1-MA9  0.1090172  0.1113263  0.9793 0.3274525    
## x2-MA0 -0.0545344  0.0123805 -4.4049 1.059e-05 ***
## x2-MA1 -0.0116954  0.0157718 -0.7415 0.4583657    
## x2-MA2  0.0148546  0.0155276  0.9567 0.3387399    
## x2-MA3 -0.0021988  0.0153007 -0.1437 0.8857335    
## x2-MA4  0.0012191  0.0153276  0.0795 0.9366062    
## x2-MA5 -0.0127058  0.0152362 -0.8339 0.4043249    
## x2-MA6  0.0025426  0.0149971  0.1695 0.8653696    
## x2-MA7  0.0026165  0.0150071  0.1744 0.8615893    
## x2-MA8  0.0095000  0.0147512  0.6440 0.5195661    
## x2-MA9 -0.0055924  0.0103251 -0.5416 0.5880764    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Check the regression error to confirm that no further differencing is needed
TF.RegressionError.plot(y.tr,x.tr,TF.fit,lag.max = 50)
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

#The regression error is stationary, so we can continue
#NOTE: If the regression error were not stationary in variance, a Box-Cox
#transformation should be applied to both the input and output series.
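# A minimal sketch of that transformation (not needed here), using the
# forecast package's Box-Cox helpers:
# lambda <- BoxCox.lambda(y.tr)
# y.tr.bc <- BoxCox(y.tr, lambda)  # fit the model on the transformed series
# # ...and back-transform the forecasts afterwards with InvBoxCox(., lambda)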

#Check the numerator (impulse-response) coefficients of the explanatory variables
TF.Identification.plot(x.tr,TF.fit)
## [1] "x1"
##           Estimate Std. Error    z value     Pr(>|z|)
## x1-MA0  0.91260651  0.1025226  8.9015152 5.508941e-19
## x1-MA1  0.36769822  0.1119134  3.2855616 1.017793e-03
## x1-MA2  0.13945874  0.1124597  1.2400780 2.149465e-01
## x1-MA3 -0.18832188  0.1149311 -1.6385631 1.013043e-01
## x1-MA4  0.05862359  0.1155526  0.5073323 6.119217e-01
## x1-MA5 -0.15064874  0.1152302 -1.3073715 1.910866e-01
## x1-MA6 -0.04614041  0.1157331 -0.3986794 6.901295e-01
## x1-MA7 -0.06191781  0.1157350 -0.5349964 5.926523e-01
## x1-MA8  0.17846544  0.1159314  1.5394058 1.237053e-01
## x1-MA9  0.10901718  0.1113263  0.9792580 3.274525e-01
## [1] "x2"
##            Estimate Std. Error     z value     Pr(>|z|)
## x2-MA0 -0.054534361 0.01238050 -4.40485820 1.058531e-05
## x2-MA1 -0.011695449 0.01577182 -0.74154068 4.583657e-01
## x2-MA2  0.014854636 0.01552764  0.95665794 3.387399e-01
## x2-MA3 -0.002198784 0.01530069 -0.14370491 8.857335e-01
## x2-MA4  0.001219098 0.01532761  0.07953608 9.366062e-01
## x2-MA5 -0.012705826 0.01523623 -0.83392209 4.043249e-01
## x2-MA6  0.002542645 0.01499706  0.16954291 8.653696e-01
## x2-MA7  0.002616509 0.01500710  0.17435145 8.615893e-01
## x2-MA8  0.009499993 0.01475121  0.64401432 5.195661e-01
## x2-MA9 -0.005592361 0.01032514 -0.54162566 5.880764e-01

## Transfer function parameters for more than one input

# Choose the values below using the identification plots
#x1:   
b1=0; r1=1; s1=0;
#x2:   
b2=0; r2=0; s2=0;
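# Rationale (a reading of the identification output above, as a judgment call):
# for x1, MA0 is clearly significant and the later lags die out gradually,
# suggesting no delay (b1 = 0) with first-order decay (r1 = 1, s1 = 0). For x2,
# only MA0 is significant, so an instantaneous effect suffices
# (b2 = 0, r2 = 0, s2 = 0).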


# Next, apply the delays b to the inputs. Hmisc::Lag shifts a series forward
# and fills the first b positions with NA, so those are replaced with 0.
xlag <- x.tr
xlag[,1] <- Lag(x.tr[,1],b1) # delay b1
xlag[,2] <- Lag(x.tr[,2],b2) # delay b2

xlag[is.na(xlag)] <- 0

# and fit the arima model with those parameters
# and the ARMA structure (p, d, q)(P, D, Q)_s

arima.fit <- arima(y.tr,
                   xtransf = xlag,
                   order=c(0, 1, 1),
                   # seasonal = list(order=c(0,0,0),period=freq),
                   transfer = list(c(r1,s1), c(r2,s2)),
                   method="ML")


summary(arima.fit) #summary of training errors and estimated coefficients
## 
## Call:
## arima(x = y.tr, order = c(0, 1, 1), method = "ML", xtransf = xlag, transfer = list(c(r1, 
##     s1), c(r2, s2)))
## 
## Coefficients:
##           ma1  x1-AR1  x1-MA0   x2-MA0
##       -0.2989  0.0153  1.2520  -0.0527
## s.e.   0.0723  0.0169  0.0466   0.0068
## 
## sigma^2 estimated as 0.001286:  log likelihood = 362.66,  aic = -717.33
## 
## Training set error measures:
##                       ME       RMSE        MAE       MPE      MAPE      MASE
## Training set 0.005980813 0.03577307 0.02732755 0.1184703 0.4490564 0.4323748
##                     ACF1
## Training set -0.02735941
coeftest(arima.fit) #statistical significance of estimated coefficients
## 
## z test of coefficients:
## 
##          Estimate Std. Error z value  Pr(>|z|)    
## ma1    -0.2988737  0.0722874 -4.1345 3.557e-05 ***
## x1-AR1  0.0152844  0.0168652  0.9063    0.3648    
## x1-MA0  1.2520229  0.0465576 26.8919 < 2.2e-16 ***
## x2-MA0 -0.0527139  0.0068213 -7.7279 1.093e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Check residuals
CheckResiduals.ICAI(arima.fit, lag=50)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 42.195, df = 46, p-value = 0.6323
## 
## Model df: 4.   Total lags used: 50
#ma1, x1-MA0 and x2-MA0 are significant; x1-AR1 is not (p ~ 0.36),
#so a model with r1 = 0 could also be tried.
#All residual autocorrelations are small --> Good.


#Check the cross-correlation between the residuals and the explanatory variables
res <- residuals(arima.fit)
res[is.na(res)] <- 0
ccf(y = res, x = x.tr[,1])

ccf(y = res, x = x.tr[,2])
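# Spikes outside the confidence bounds in these CCF plots would point to input
# dynamics the transfer functions have not captured (e.g. a missing lag).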

########
#None of the explanatory variables appears correlated with the residuals -> Good


## Finished model identification

#Check fitted
autoplot(y.tr, series = "Real")+
  forecast::autolayer(fitted(arima.fit), series = "Fitted")

#Check the training errors of the model
accuracy(fitted(arima.fit),y.tr)
##                   ME       RMSE        MAE       MPE      MAPE        ACF1
## Test set 0.005980813 0.03577307 0.02732755 0.1184703 0.4490564 -0.02735941
##          Theil's U
## Test set  0.497303
# Here, with h = 12, we forecast all 12 test values at once
y_est <- TF.forecast(y.old = y.tr, #past values of the series
                     x.old = x.tr, #Past values of the explanatory variables
                     x.new = x.ts, #New values of the explanatory variables
                     model = arima.fit, #fitted transfer function model
                     h=12) #Forecast horizon
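
# A sketch of the alternative: rolling one-step-ahead (h = 1) forecasts that
# refeed the observed test values as they become known (a judgment call, not
# run here; it reuses the TF.forecast signature from the call above):
# y_est_h1 <- numeric(len_TV)
# for (i in seq_len(len_TV)) {
#   y_est_h1[i] <- TF.forecast(y.old = ts(c(y.tr, head(y.ts, i - 1)), frequency = freq),
#                              x.old = ts(rbind(x.tr, head(x.ts, i - 1)), frequency = freq),
#                              x.new = ts(x.ts[i, , drop = FALSE], frequency = freq),
#                              model = arima.fit,
#                              h = 1)
# }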


y_est
##  [1] 13.11702 13.21805 13.25273 13.29793 13.32857 13.30632 13.28145 13.14555
##  [9] 13.00587 12.95861 12.87557 12.93799
y_est <- ts(y_est, frequency = freq,
                start = end(y.tr) + c(0,1))


# Plot the forecast and real values

autoplot(y.ts, series = "Test set (real values)") +
  forecast::autolayer(y_est, series = "Forecast h=12 for the test") +
  forecast::autolayer(tail(y.tr, 24), series = "Training data")

# Accuracy in test
accuracy(y_est * scale_y, y.ts * scale_y)
##                ME     RMSE      MAE       MPE      MAPE      ACF1 Theil's U
## Test set 34.76367 91.57477 77.50328 0.2596796 0.5862351 0.6163587 0.8564871
# Save the forecast values to a file
write.table(y_est * scale_y, file = "Pred_TF_2inputs.dat", 
            col.names = FALSE, row.names = FALSE)
getwd()
## [1] "/Users/fernando/code/frcst_imat_public/2025_10_29_TransferFunctions_2"