Section 1: data preparation

The log return, or equivalently the continuous growth rate, from \(P_1\) to \(P_2\) is
\[ \log(P_2) - \log(P_1) = \log\left(\frac{P_2}{P_1}\right). \] For example, consider \(\log(96/95) = \log(1.01053) \approx 0.0105\), which corresponds to a growth rate of about 1.05%. The percentage change would be \[\frac{P_2 - P_1}{P_1} = \frac{96 - 95}{95} \approx 0.0105,\] so we can see that for small changes the log return approximates the percentage change well.

# Install quantmod (plus its dependencies xts, zoo, TTR) if it is missing.
# requireNamespace() is preferred over require() for availability checks:
# it tests whether the package can be loaded without attaching it and
# without emitting a warning when the package is absent.
if (!requireNamespace("quantmod", quietly = TRUE)) install.packages("quantmod")
## Output from the original (require-based) run:
## Loading required package: quantmod
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'quantmod'
## Installing package into '/Users/runner/work/_temp/Library'
## (as 'lib' is unspecified)
## also installing the dependencies 'xts', 'zoo', 'TTR'
## 
## The downloaded binary packages are in
##  /var/folders/tb/y368xp_x10s3ty1b_mtl5mxr0000gn/T//RtmpV9w6QE/downloaded_packages
# Load required libraries (suppress startup messages from xts/zoo/TTR)
suppressMessages({ library(quantmod); library(xts)
})
# Download S&P 500 index data from Yahoo Finance (2020-2023, i.e. 4 years);
# auto.assign = TRUE creates the object GSPC in the global environment
getSymbols("^GSPC", from="2020-01-01", to="2023-12-31", auto.assign=TRUE, warnings=FALSE)
## [1] "GSPC"
prices <- Cl(GSPC) # daily closing prices; ~252 trading days x 4 years, actual length 1006
 
print(length(prices))
## [1] 1006
# log(P_t) - log(P_{t-1}) = approximation of a percentage change (for small changes)
# continuously compounded rate of return (i.e. the log return)
returns <- diff(log(prices))[-1]  # [-1] drops the leading NA that diff() produces
 
print(returns)
##               GSPC.Close
## 2020-01-03 -0.0070849094
## 2020-01-06  0.0035271452
## 2020-01-07 -0.0028071751
## 2020-01-08  0.0048904735
## 2020-01-09  0.0066332141
## 2020-01-10 -0.0028592625
## 2020-01-13  0.0069519940
## 2020-01-14 -0.0015156808
## 2020-01-15  0.0018684502
## 2020-01-16  0.0083317474
##        ...              
## 2023-12-15 -0.0000762524
## 2023-12-18  0.0045181223
## 2023-12-19  0.0058492675
## 2023-12-20 -0.0147931480
## 2023-12-21  0.0102487693
## 2023-12-22  0.0016586822
## 2023-12-26  0.0042227610
## 2023-12-27  0.0014294356
## 2023-12-28  0.0003701061
## 2023-12-29 -0.0028304770
print(prices)
##            GSPC.Close
## 2020-01-02    3257.85
## 2020-01-03    3234.85
## 2020-01-06    3246.28
## 2020-01-07    3237.18
## 2020-01-08    3253.05
## 2020-01-09    3274.70
## 2020-01-10    3265.35
## 2020-01-13    3288.13
## 2020-01-14    3283.15
## 2020-01-15    3289.29
##        ...           
## 2023-12-15    4719.19
## 2023-12-18    4740.56
## 2023-12-19    4768.37
## 2023-12-20    4698.35
## 2023-12-21    4746.75
## 2023-12-22    4754.63
## 2023-12-26    4774.75
## 2023-12-27    4781.58
## 2023-12-28    4783.35
## 2023-12-29    4769.83
# Calculate realized variance (proxy using squared daily log returns)
rv <- as.numeric(returns^2 * 252) # annualized: squared daily return x 252 trading days
 
rv <- rv[!is.na(rv)] # Remove NAs (defensive; the leading NA was already dropped above)
 
head(rv)
## [1] 0.012649377 0.003135070 0.001985818 0.006027016 0.011087881 0.002060196
print( length(rv) )
## [1] 1005
# Create the HAR regressors from the realized-variance series.
# For each date t, rv[t] is regressed on yesterday's rv (daily), the mean rv
# over the previous 5 trading days (weekly) and over the previous 22 trading
# days (monthly) -- the standard HAR-RV cascade.
n <- length(rv)

# First usable observation: 22 lags are needed for the monthly component.
start_idx <- 23

# Pre-allocate matrices (never grow objects inside the loop)
n_obs <- n - start_idx + 1

y <- numeric(n_obs); X <- matrix(NA, n_obs, 4)

length(y)
## [1] 983
colnames(X) <- c("Intercept", "Daily", "Weekly", "Monthly")
# Fill data matrices.
# seq_len() is safe even when n_obs == 0 (1:n_obs would yield c(1, 0));
# t_idx avoids shadowing the base transpose function t().
for (i in seq_len(n_obs)) {
  t_idx <- start_idx + i - 1
  # Dependent variable
  y[i] <- rv[t_idx]
  # Regressors (regressor matrix)
  X[i, 1] <- 1                             # intercept
  X[i, 2] <- rv[t_idx - 1]                 # daily rv (1-day lag)
  X[i, 3] <- mean(rv[(t_idx - 5):(t_idx - 1)])   # weekly rv (5 trading days)
  X[i, 4] <- mean(rv[(t_idx - 22):(t_idx - 1)])  # monthly rv (22 trading days)
}

T_obs <- length(y)

dim(X) # shape of the regressor matrix
## [1] 983   4
length(y) # length of the dep var vector
## [1] 983
head(y) # dependent variable
## [1] 0.0315419306 0.0027779043 0.0073905408 0.0134279399 0.0007172256
## [6] 0.0104573628
head(X) # regressor matrix
##      Intercept        Daily     Weekly    Monthly
## [1,]         1 0.0557163321 0.03039275 0.01523969
## [2,]         1 0.0315419306 0.03666322 0.01609844
## [3,]         1 0.0027779043 0.03672521 0.01608221
## [4,]         1 0.0073905408 0.02211875 0.01632788
## [5,]         1 0.0134279399 0.02217093 0.01666428
## [6,]         1 0.0007172256 0.01117111 0.01619289

Section 2: MCMC functions

The Gibbs sampler generates a Markov chain with stationary distribution \(f\) from an at least two-dimensional probability density function (or probability mass function) \(f\) on a sample space \(\Omega \subseteq \mathbb{R}^{d}, d \geq 2\), by sequentially updating the coordinates of \(\boldsymbol{x} \in \Omega\). We don’t need proposal functions because the Gibbs sampler draws from the conditional distribution of \(X_k \mid X_{-k} = x_{-k}\) to update entry \(k\) of \(\boldsymbol{x}\). In this case, we use a systematic-scan Gibbs sampler because we cycle through the parameters of interest, \(\beta\) and \(\sigma^{2}\), in a fixed order.

# Gibbs sampler for the Bayesian HAR regression model
#   y = X beta + eps,  eps ~ N(0, sigma2 * I)
# with conjugate priors
#   beta   ~ N(beta_0, V_0)
#   sigma2 ~ InvGamma(a_0, b_0)
# This is a systematic-scan Gibbs sampler: beta and sigma2 are updated in a
# fixed order from their full conditional distributions, so no proposal
# distribution is needed.
#
# Arguments:
#   y      - numeric response vector (realized variance)
#   X      - T x 4 regressor matrix (intercept, daily, weekly, monthly)
#   n_iter - total number of MCMC iterations
#   burnin - number of initial iterations to discard (must be < n_iter)
# Returns a list of post-burnin draws:
#   beta   - (n_iter - burnin) x ncol(X) matrix
#   sigma2 - numeric vector of length (n_iter - burnin)
gibbs_har <- function(y,
                      X,
                      n_iter = 10000,
                      burnin = 2000) {
  # Prior parameters
  beta_0 <- c(0, 0.4, 0.3, 0.2)          # prior means
  V_0 <- diag(c(100, 0.04, 0.04, 0.04))  # prior covariance
  a_0 <- 2; b_0 <- 1                     # InvGamma(shape, rate) for sigma^2

  # Initialize at the prior mean (beta) and unit variance (sigma2)
  k <- ncol(X); T_obs <- length(y); beta <- beta_0; sigma2 <- 1

  # Store only post-burnin samples
  n_keep <- n_iter - burnin
  beta_draws <- matrix(NA, n_keep, k); sigma2_draws <- numeric(n_keep)

  # Precompute loop invariants: the prior precision, X'X, X'y, and the
  # posterior InvGamma shape a_n (it depends only on T_obs, not on beta).
  V_0_inv <- solve(V_0)
  XtX <- crossprod(X)     # t(X) %*% X, computed in one pass
  Xty <- crossprod(X, y)  # t(X) %*% y
  a_n <- a_0 + T_obs / 2

  # V_n is the posterior variance-covariance matrix of the coefficients
  # (diagonal: variances, off-diagonal: covariances); beta_n and V_n
  # parameterize the multivariate normal full conditional of beta, while
  # a_n and b_n parameterize the inverse-gamma full conditional of sigma^2.

  # Main loop (systematic-scan Gibbs sampler)
  for (iter in seq_len(n_iter)) {
    # Update beta | sigma2, y: N(beta_n, V_n) with precision
    # V_n_inv = V_0_inv + X'X / sigma2.
    V_n_inv <- V_0_inv + XtX / sigma2
    V_n <- solve(V_n_inv)
    beta_n <- V_n %*% (V_0_inv %*% beta_0 + Xty / sigma2)
    # Draw via the lower-triangular Cholesky factor of V_n.
    beta <- drop(beta_n + t(chol(V_n)) %*% rnorm(k))

    # Update sigma2 | beta, y: InvGamma(a_n, b_n), sampled as the
    # reciprocal of a Gamma(shape = a_n, rate = b_n) draw.
    residuals <- y - X %*% beta
    b_n <- b_0 + sum(residuals^2) / 2
    sigma2 <- 1 / rgamma(1, a_n, b_n)

    # Store post-burnin draws only
    if (iter > burnin) {
      beta_draws[iter - burnin, ] <- beta
      sigma2_draws[iter - burnin] <- sigma2
    }
  }

  list(beta = beta_draws, sigma2 = sigma2_draws)
}

Section 3: extracting the results

# results$beta and results$sigma2 contain the empirical (posterior) distributions
# of the two estimated parameters of interest (default: 10,000 iterations,
# first 2,000 discarded as burn-in)
results <- gibbs_har(y, X)
 
colMeans(results$beta) # posterior means of the volatility persistence params: intercept, daily rv, weekly rv, monthly rv
## [1] 0.01323416 0.23584119 0.43132166 0.08930988
mean(results$sigma2) # posterior mean for the error variance
## [1] 0.03167416

Section 4: MCMC diagnostics

Recap of the Markov Chain Monte Carlo algorithm:

Two main problems arise: (P1) the chain only reaches its stationary distribution asymptotically, so early draws are contaminated by the arbitrary starting value; and (P2) successive draws are autocorrelated, so the effective sample size is smaller than the nominal number of iterations.

(P1) can be addressed by discarding a large chunk of initial observations (burn-in sample) or by increasing the no. of iterations (MCMC sample size); (P2) can be mitigated by running the chain longer (and optionally thinning the draws).

Markov Chain Monte Carlo (MCMC) diagnostics are tools that can be used to check whether the quality of a sample generated with an MCMC algorithm is sufficient to provide an accurate approximation of the target distribution.

In particular, MCMC diagnostics are used to check:

Diagnostic tools:

4.1 Trace plots

# Trace plot for the daily realized-volatility coefficient: the path the
# sampler followed while exploring the posterior distribution of beta_d.
plot(
  results$beta[, 2],
  type = "l",
  col = "steelblue",
  main = "Trace Plot for beta_d (daily rv)",
  xlab = "MCMC Iteration (t)",
  ylab = "Value"
)

trace plot of all mcmc samples including the burn-in samples

# Re-run the sampler keeping every draw (burnin = 0) so that the burn-in
# phase itself is visible in the trace plot.
results <- gibbs_har(
  y, X,
  n_iter = 10000,
  burnin = 0 # keep all MCMC samples, including what would normally be burn-in
)

# results$beta now holds all 10,000 iterations, starting from iteration 1
plot(results$beta[, 2],
     type = "l", col = "steelblue",
     main = "Trace Plot for Beta_d (daily rv) with burn-in")

# Red dashed vertical line marking where the default burn-in would end
burnin_end <- 2000
abline(v = burnin_end, col = "red", lty = 2, lwd = 2)
legend("topright", legend = "End of Burn-in", col = "red", lty = 2)

# Posterior histogram for the daily realized-volatility coefficient
hist(
  results$beta[, 2],
  probability = FALSE, # FALSE: y-axis shows counts; TRUE would show density
  breaks = 30,
  col = "steelblue",
  main = "Histogram of the posterior for Beta_d (daily rv)",
  ylab = "Frequency (counts)",
  xlab = expression(paste("Daily Persistence ", beta[d])) # plotmath subscript
)

# Blue dashed line at the posterior mean of beta_d
post_mean_bd <- mean(results$beta[, 2])
abline(v = post_mean_bd, col = "blue", lty = 2, lwd = 2)
legend("topright", legend = "posterior mean", col = "blue", lty = 2)

4.2 Multiple chains (running the algorithm multiple times in parallel)

library(parallel)

# Number of independent chains (i.e. independent runs of the MCMC algorithm)
n_chains <- 2


# Run the chains in parallel. mclapply() relies on forking, which is only
# available on Mac/Linux; on Windows forking is unsupported, so fall back to
# sequential execution (mc.cores = 1). Windows users could use parLapply()
# with a PSOCK cluster instead.
chains_list <- mclapply(1:n_chains, function(i) {
  gibbs_har(y, X, n_iter = 10000, burnin = 2000)
}, mc.cores = if (.Platform$OS.type == "windows") 1 else n_chains)



# Sequential alternative: lapply() applies a function over a list or vector.
# NOTE(review): the original code drew a random beta_init here but never
# passed it to gibbs_har(), which always starts from its prior mean -- the
# comment claiming different starting values was therefore misleading, and
# the dead rnorm() call has been removed. The chains differ only through the
# random number stream consumed inside the sampler.
chains_out <- lapply(1:n_chains, function(i) {
  gibbs_har(y, X, n_iter = 10000, burnin = 2000)
})


str( chains_out ) # displaying the structure of the list object (2 chains x 8,000 post-burnin draws)
## List of 2
##  $ :List of 2
##   ..$ beta  : num [1:8000, 1:4] 0.00722 0.00849 0.00811 0.01331 0.00822 ...
##   ..$ sigma2: num [1:8000] 0.0347 0.0293 0.0322 0.0319 0.0298 ...
##  $ :List of 2
##   ..$ beta  : num [1:8000, 1:4] 0.0173 0.0104 0.0128 0.0186 0.0102 ...
##   ..$ sigma2: num [1:8000] 0.0337 0.0333 0.03 0.0319 0.0305 ...
# Comparing the posterior colMeans of both chains; close agreement (here to
# ~3 decimals) is consistent with both chains having converged to the same
# posterior distribution
colMeans(chains_out[[1]]$beta)
## [1] 0.01305197 0.23642811 0.43025828 0.09084691
colMeans(chains_out[[2]]$beta)
## [1] 0.01314847 0.23619552 0.43098901 0.08942888