Section 1: data preparation

The log return, or equivalently the continuous growth rate, from \(P_1\) to \(P_2\) is
\[ \log(P_2) - \log(P_1) = \log\left(\frac{P_2}{P_1}\right). \] For example, consider \(\log(96/95) = \log(1.01053) \approx 0.0105\), which corresponds to a growth rate of about 1.05%. The percentage change would be \[\frac{P_2 - P_1}{P_1} = \frac{96 - 95}{95} \approx 0.0105,\] so we can see that for small changes the log return approximates the percentage change well.

# Install quantmod (plus its dependencies xts, zoo, TTR) if it is missing.
# requireNamespace() is preferred over require() for availability checks:
# it tests whether the package can be loaded without attaching it and
# without emitting a warning when the package is absent.
if (!requireNamespace("quantmod", quietly = TRUE)) install.packages("quantmod")
## Output from the original (require-based) run:
## Loading required package: quantmod
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'quantmod'
## Installing package into '/Users/runner/work/_temp/Library'
## (as 'lib' is unspecified)
## also installing the dependencies 'xts', 'zoo', 'TTR'
## 
## The downloaded binary packages are in
##  /var/folders/tb/y368xp_x10s3ty1b_mtl5mxr0000gn/T//RtmpV9w6QE/downloaded_packages
# Load required libraries (suppress startup messages from xts/zoo/TTR)
suppressMessages({ library(quantmod); library(xts)
})
# Download S&P 500 index data from Yahoo Finance (2020-2023, i.e. 4 years);
# auto.assign = TRUE creates the object GSPC in the global environment
getSymbols("^GSPC", from="2020-01-01", to="2023-12-31", auto.assign=TRUE, warnings=FALSE)
## [1] "GSPC"
prices <- Cl(GSPC) # daily closing prices; ~252 trading days x 4 years, actual length 1006
 
print(length(prices))
## [1] 1006
# log(P_t) - log(P_{t-1}) = approximation of a percentage change (for small changes)
# continuously compounded rate of return (i.e. the log return)
returns <- diff(log(prices))[-1]  # [-1] drops the leading NA that diff() produces
 
print(returns)
##               GSPC.Close
## 2020-01-03 -0.0070849094
## 2020-01-06  0.0035271452
## 2020-01-07 -0.0028071751
## 2020-01-08  0.0048904735
## 2020-01-09  0.0066332141
## 2020-01-10 -0.0028592625
## 2020-01-13  0.0069519940
## 2020-01-14 -0.0015156808
## 2020-01-15  0.0018684502
## 2020-01-16  0.0083317474
##        ...              
## 2023-12-15 -0.0000762524
## 2023-12-18  0.0045181223
## 2023-12-19  0.0058492675
## 2023-12-20 -0.0147931480
## 2023-12-21  0.0102487693
## 2023-12-22  0.0016586822
## 2023-12-26  0.0042227610
## 2023-12-27  0.0014294356
## 2023-12-28  0.0003701061
## 2023-12-29 -0.0028304770
print(prices)
##            GSPC.Close
## 2020-01-02    3257.85
## 2020-01-03    3234.85
## 2020-01-06    3246.28
## 2020-01-07    3237.18
## 2020-01-08    3253.05
## 2020-01-09    3274.70
## 2020-01-10    3265.35
## 2020-01-13    3288.13
## 2020-01-14    3283.15
## 2020-01-15    3289.29
##        ...           
## 2023-12-15    4719.19
## 2023-12-18    4740.56
## 2023-12-19    4768.37
## 2023-12-20    4698.35
## 2023-12-21    4746.75
## 2023-12-22    4754.63
## 2023-12-26    4774.75
## 2023-12-27    4781.58
## 2023-12-28    4783.35
## 2023-12-29    4769.83
# Calculate realized variance (proxy using squared daily log returns)
rv <- as.numeric(returns^2 * 252) # annualized: squared daily return x 252 trading days
 
rv <- rv[!is.na(rv)] # Remove NAs (defensive; the leading NA was already dropped above)
 
head(rv)
## [1] 0.012649377 0.003135070 0.001985818 0.006027016 0.011087881 0.002060196
print( length(rv) )
## [1] 1005
# Create the HAR regressors from the realized-variance series.
# For each date t, rv[t] is regressed on yesterday's rv (daily), the mean rv
# over the previous 5 trading days (weekly) and over the previous 22 trading
# days (monthly) -- the standard HAR-RV cascade.
n <- length(rv)

# First usable observation: 22 lags are needed for the monthly component.
start_idx <- 23

# Pre-allocate matrices (never grow objects inside the loop)
n_obs <- n - start_idx + 1

y <- numeric(n_obs); X <- matrix(NA, n_obs, 4)

length(y)
## [1] 983
colnames(X) <- c("Intercept", "Daily", "Weekly", "Monthly")
# Fill data matrices.
# seq_len() is safe even when n_obs == 0 (1:n_obs would yield c(1, 0));
# t_idx avoids shadowing the base transpose function t().
for (i in seq_len(n_obs)) {
  t_idx <- start_idx + i - 1
  # Dependent variable
  y[i] <- rv[t_idx]
  # Regressors (regressor matrix)
  X[i, 1] <- 1                             # intercept
  X[i, 2] <- rv[t_idx - 1]                 # daily rv (1-day lag)
  X[i, 3] <- mean(rv[(t_idx - 5):(t_idx - 1)])   # weekly rv (5 trading days)
  X[i, 4] <- mean(rv[(t_idx - 22):(t_idx - 1)])  # monthly rv (22 trading days)
}

T_obs <- length(y)

dim(X) # shape of the regressor matrix
## [1] 983   4
length(y) # length of the dep var vector
## [1] 983
head(y) # dependent variable
## [1] 0.0315419306 0.0027779043 0.0073905408 0.0134279399 0.0007172256
## [6] 0.0104573628
head(X) # regressor matrix
##      Intercept        Daily     Weekly    Monthly
## [1,]         1 0.0557163321 0.03039275 0.01523969
## [2,]         1 0.0315419306 0.03666322 0.01609844
## [3,]         1 0.0027779043 0.03672521 0.01608221
## [4,]         1 0.0073905408 0.02211875 0.01632788
## [5,]         1 0.0134279399 0.02217093 0.01666428
## [6,]         1 0.0007172256 0.01117111 0.01619289

Section 2: MCMC functions

The Gibbs sampler generates a Markov chain with stationary distribution \(f\) from an at least two-dimensional probability density function (or probability mass function) \(f\) on a sample space \(\Omega \subseteq \mathbb{R}^{d}, d \geq 2\), by sequentially updating the coordinates of \(\boldsymbol{x} \in \Omega\). We don’t need proposal functions because the Gibbs sampler draws from the conditional distribution of \(X_k \mid X_{-k} = x_{-k}\) to update entry \(k\) of \(\boldsymbol{x}\). In this case, we use a systematic-scan Gibbs sampler because we cycle through the parameters of interest, \(\beta\) and \(\sigma^{2}\), in a fixed order.

# Gibbs sampler for the Bayesian HAR regression model
#   y = X beta + eps,  eps ~ N(0, sigma2 * I)
# with conjugate priors
#   beta   ~ N(beta_0, V_0)
#   sigma2 ~ InvGamma(a_0, b_0)
# This is a systematic-scan Gibbs sampler: beta and sigma2 are updated in a
# fixed order from their full conditional distributions, so no proposal
# distribution is needed.
#
# Arguments:
#   y      - numeric response vector (realized variance)
#   X      - T x 4 regressor matrix (intercept, daily, weekly, monthly)
#   n_iter - total number of MCMC iterations
#   burnin - number of initial iterations to discard (must be < n_iter)
# Returns a list of post-burnin draws:
#   beta   - (n_iter - burnin) x ncol(X) matrix
#   sigma2 - numeric vector of length (n_iter - burnin)
gibbs_har <- function(y,
                      X,
                      n_iter = 10000,
                      burnin = 2000) {
  # Prior parameters
  beta_0 <- c(0, 0.4, 0.3, 0.2)          # prior means
  V_0 <- diag(c(100, 0.04, 0.04, 0.04))  # prior covariance
  a_0 <- 2; b_0 <- 1                     # InvGamma(shape, rate) for sigma^2

  # Initialize at the prior mean (beta) and unit variance (sigma2)
  k <- ncol(X); T_obs <- length(y); beta <- beta_0; sigma2 <- 1

  # Store only post-burnin samples
  n_keep <- n_iter - burnin
  beta_draws <- matrix(NA, n_keep, k); sigma2_draws <- numeric(n_keep)

  # Precompute loop invariants: the prior precision, X'X, X'y, and the
  # posterior InvGamma shape a_n (it depends only on T_obs, not on beta).
  V_0_inv <- solve(V_0)
  XtX <- crossprod(X)     # t(X) %*% X, computed in one pass
  Xty <- crossprod(X, y)  # t(X) %*% y
  a_n <- a_0 + T_obs / 2

  # V_n is the posterior variance-covariance matrix of the coefficients
  # (diagonal: variances, off-diagonal: covariances); beta_n and V_n
  # parameterize the multivariate normal full conditional of beta, while
  # a_n and b_n parameterize the inverse-gamma full conditional of sigma^2.

  # Main loop (systematic-scan Gibbs sampler)
  for (iter in seq_len(n_iter)) {
    # Update beta | sigma2, y: N(beta_n, V_n) with precision
    # V_n_inv = V_0_inv + X'X / sigma2.
    V_n_inv <- V_0_inv + XtX / sigma2
    V_n <- solve(V_n_inv)
    beta_n <- V_n %*% (V_0_inv %*% beta_0 + Xty / sigma2)
    # Draw via the lower-triangular Cholesky factor of V_n.
    beta <- drop(beta_n + t(chol(V_n)) %*% rnorm(k))

    # Update sigma2 | beta, y: InvGamma(a_n, b_n), sampled as the
    # reciprocal of a Gamma(shape = a_n, rate = b_n) draw.
    residuals <- y - X %*% beta
    b_n <- b_0 + sum(residuals^2) / 2
    sigma2 <- 1 / rgamma(1, a_n, b_n)

    # Store post-burnin draws only
    if (iter > burnin) {
      beta_draws[iter - burnin, ] <- beta
      sigma2_draws[iter - burnin] <- sigma2
    }
  }

  list(beta = beta_draws, sigma2 = sigma2_draws)
}

Section 3: extracting the results

# results$beta and results$sigma2 contain the empirical (posterior) distributions
# of the two estimated parameters of interest (default: 10,000 iterations,
# first 2,000 discarded as burn-in)
results <- gibbs_har(y, X)
 
colMeans(results$beta) # posterior means of the volatility persistence params: intercept, daily rv, weekly rv, monthly rv
## [1] 0.01323416 0.23584119 0.43132166 0.08930988
mean(results$sigma2) # posterior mean for the error variance
## [1] 0.03167416

Section 4: MCMC diagnostics

Recap of the Markov Chain Monte Carlo algorithm:

Two main problems arise: (P1) the chain only reaches its stationary distribution asymptotically, so early draws are contaminated by the arbitrary starting value; and (P2) successive draws are autocorrelated, so the effective sample size is smaller than the nominal number of iterations.

(P1) can be addressed by discarding a large chunk of initial observations (burn-in sample) or by increasing the no. of iterations (MCMC sample size); (P2) can be mitigated by running the chain longer (and optionally thinning the draws).

Markov Chain Monte Carlo (MCMC) diagnostics are tools that can be used to check whether the quality of a sample generated with an MCMC algorithm is sufficient to provide an accurate approximation of the target distribution.

In particular, MCMC diagnostics are used to check:

Diagnostic tools:

4.1 Trace plots

# Trace plot for the daily realized-volatility coefficient: the path the
# sampler followed while exploring the posterior distribution of beta_d.
plot(
  results$beta[, 2],
  type = "l",
  col = "steelblue",
  main = "Trace Plot for beta_d (daily rv)",
  xlab = "MCMC Iteration (t)",
  ylab = "Value"
)

trace plot of all mcmc samples including the burn-in samples

# Re-run the sampler keeping every draw (burnin = 0) so that the burn-in
# phase itself is visible in the trace plot.
results <- gibbs_har(
  y, X,
  n_iter = 10000,
  burnin = 0 # keep all MCMC samples, including what would normally be burn-in
)

# results$beta now holds all 10,000 iterations, starting from iteration 1
plot(results$beta[, 2],
     type = "l", col = "steelblue",
     main = "Trace Plot for Beta_d (daily rv) with burn-in")

# Red dashed vertical line marking where the default burn-in would end
burnin_end <- 2000
abline(v = burnin_end, col = "red", lty = 2, lwd = 2)
legend("topright", legend = "End of Burn-in", col = "red", lty = 2)

# Posterior histogram for the daily realized-volatility coefficient
hist(
  results$beta[, 2],
  probability = FALSE, # FALSE: y-axis shows counts; TRUE would show density
  breaks = 30,
  col = "steelblue",
  main = "Histogram of the posterior for Beta_d (daily rv)",
  ylab = "Frequency (counts)",
  xlab = expression(paste("Daily Persistence ", beta[d])) # plotmath subscript
)

# Blue dashed line at the posterior mean of beta_d
post_mean_bd <- mean(results$beta[, 2])
abline(v = post_mean_bd, col = "blue", lty = 2, lwd = 2)
legend("topright", legend = "posterior mean", col = "blue", lty = 2)

4.2 Multiple chains (running the algorithm multiple times in parallel)

library(parallel)

# Number of independent chains (i.e. independent runs of the MCMC algorithm)
n_chains <- 2


# Run the chains in parallel. mclapply() relies on forking, which is only
# available on Mac/Linux; on Windows forking is unsupported, so fall back to
# sequential execution (mc.cores = 1). Windows users could use parLapply()
# with a PSOCK cluster instead.
chains_list <- mclapply(1:n_chains, function(i) {
  gibbs_har(y, X, n_iter = 10000, burnin = 2000)
}, mc.cores = if (.Platform$OS.type == "windows") 1 else n_chains)



# Sequential alternative: lapply() applies a function over a list or vector.
# NOTE(review): the original code drew a random beta_init here but never
# passed it to gibbs_har(), which always starts from its prior mean -- the
# comment claiming different starting values was therefore misleading, and
# the dead rnorm() call has been removed. The chains differ only through the
# random number stream consumed inside the sampler.
chains_out <- lapply(1:n_chains, function(i) {
  gibbs_har(y, X, n_iter = 10000, burnin = 2000)
})


str( chains_out ) # displaying the structure of the list object (2 chains x 8,000 post-burnin draws)
## List of 2
##  $ :List of 2
##   ..$ beta  : num [1:8000, 1:4] 0.00722 0.00849 0.00811 0.01331 0.00822 ...
##   ..$ sigma2: num [1:8000] 0.0347 0.0293 0.0322 0.0319 0.0298 ...
##  $ :List of 2
##   ..$ beta  : num [1:8000, 1:4] 0.0173 0.0104 0.0128 0.0186 0.0102 ...
##   ..$ sigma2: num [1:8000] 0.0337 0.0333 0.03 0.0319 0.0305 ...
# Comparing the posterior colMeans of both chains; close agreement (here to
# ~3 decimals) is consistent with both chains having converged to the same
# posterior distribution
colMeans(chains_out[[1]]$beta)
## [1] 0.01305197 0.23642811 0.43025828 0.09084691
colMeans(chains_out[[2]]$beta)
## [1] 0.01314847 0.23619552 0.43098901 0.08942888