Rewrite mlsd to better use the time information (#87)

* Rewrite mlsd to better use the time information * Fix tests and example * Update the documentation
mpiktas · Aug 21, 2022 · 6495222 · 6495222
1 parent 665961f
commit 6495222
Show file tree

Hide file tree

Showing 32 changed files with 237 additions and 161 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Title: Mixed Data Sampling Regression
 Description: Methods and tools for mixed frequency time series data analysis.
     Allows estimation, model selection and forecasting for MIDAS regressions.
 URL: http://mpiktas.github.io/midasr/
-Version: 0.8.1
+Version: 0.8.2
 Maintainer: Vaidotas Zemlys-Balevičius <zemlys@gmail.com>
 Author: Virmantas Kvedaras <virmantas.kvedaras@ec.europa.eu>, Vaidotas Zemlys-Balevicius
    <zemlys@gmail.com>
@@ -30,7 +30,7 @@ Suggests:
     testthat,
     lubridate,
     xts
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
 Encoding: UTF-8
 Collate: 
     'deriv.R'

diff --git a/R/midaslag.R b/R/midaslag.R
@@ -9,7 +9,7 @@
 ##' @return a matrix containing the lags
 ##' @author Virmantas Kvedaras, Vaidotas Zemlys
 ##' @seealso mls
-##' @details This is a convenience function, it calls \code{link{msl}} to perform actual calculations.
+##' @details This is a convenience function, it calls \code{link{mls}} to perform actual calculations.
 ##' @export
 ##'
 fmls <- function(x, k, m, ...) {
@@ -88,7 +88,9 @@ dmls <- function(x, k, m, ...) {
 ##'
 ##' @param data a list containing mixed frequency data
 ##' @return a boolean TRUE, if mixed frequency data is conformable, FALSE if it is not.
-##' @details The number of observations in higher frequency data elements should have a common divisor with the number of observations in response variable. It is always assumed that the response variable is of the lowest frequency.
+##' @details The number of observations in higher frequency data elements should have a common divisor
+##'  with the number of observations in response variable. It is always assumed that the response variable
+##'  is of the lowest frequency.
 ##'
 ##' @author Virmantas Kvedaras, Vaidotas Zemlys
 ##' @export
@@ -99,80 +101,72 @@ check_mixfreq <- function(data) {
   sum(m > 0) == 0
 }
 
+
 #' MIDAS lag structure with dates
 #'
-#' @param x a vector
+#' @param x a vector, of high frequency time series. Must be zoo or ts object
 #' @param k lags, a vector
-#' @param datey low frequency dates
+#' @param y a vector of low frequency time series. Must be zoo or ts object
 #' @param ... further arguments used in fitting MIDAS regression
 #'
-#' @return a matrix containing the first differences and the lag k+1.
+#' @return a matrix containing the lags
+#' @details High frequency time series is aligned with low frequency time series using date information.
+#' Then the high frequency lags are calculated.
+#'
+#' To align the time series the low frequency series index
+#' needs to be extended by one low frequency period into the past and into the future. If supplied time series
+#' object does not support extending time index, a simple heuristic is used.
+#'
+#' It is expected that time index for zoo objects can be converted to POSIXct format.
+#'
+#'
 #' @author Virmantas Kvedaras, Vaidotas Zemlys-Balevičius
 #' @importFrom stats lag
 #' @export
 #'
 #' @examples
-#' x <- c(1:144)
-#' y <- c(1:12)
-#' datey <- (y - 1) * 12 + 1
+#'
+#' # Example with ts objects
+#' x <- ts(c(1:144), start = c(1980, 1), frequency = 12)
+#' y <- ts(c(1:12), start = 1980, frequency = 1)
+#'
 #'
 #' # msld and mls should give the same results
 #'
-#' m1 <- mlsd(x, 0:5, datey)
+#' m1 <- mlsd(x, 0:5, y)
 #'
 #' m2 <- mls(x, 0:5, 12)
 #'
 #' sum(abs(m1 - m2))
-mlsd <- function(x, k, datey, ...) {
-  datex <- NULL
-  if (inherits(x, "ts")) datex <- time(x)
-  if (inherits(x, "zoo") | inherits(x, "xts")) datex <- index(x)
+#'
+#' # Example with zooreg
+#'
+#' # Convert x to zooreg object using yearmon time index
+#' \dontrun{
+#' xz <- zoo::as.zooreg(x)
+#'
+#' yz <- zoo::zoo(as.numeric(y), order.by = as.Date(paste0(1980 + 0:11, "-01-01")))
+#'
+#' # Heuristic works here
+#' m3 <- mlsd(xz, 0:5, yz)
+#'
+#' sum(abs(m3 - m1))
+#' }
+mlsd <- function(x, k, y, ...) {
+  datex <- get_datex(x)
 
-  x <- as.numeric(x)
-  if (is.null(datex)) datex <- 1:length(x)
-
-  if (length(x) != length(datex)) stop("The date vector for high frequency data must be the same length as a data")
-
-  ## We always assume that if we observe data for low frequency period
-  ## the high frequency data is observed until the end of that period.
-  ## We build the indexes of the high frequency observations.
-  ## For that we need to capture all the x observations, hence the minimum period
-  ## must be the minimum x period.
-  ## The max period needs to be the next period after the maximum datey period.
-  ## If there is any data which falls into first period, we can discard it.
-
-  if (inherits(datey, "ts")) {
-    datey0 <- datey
-    left <- min(time(lag(datey, 1)))
-    right <- max(time(lag(datey, -1)))
-    if (min(datex) < left) left <- min(datex)
-    datey <- c(left, time(datey), right) - 0.001
-    datey0 <- time(datey0) - 0.001
-  } else {
-    if (inherits(datey, "zoo") | inherits(datey, "xts")) {
-      datey <- index(datey)
-    }
-    datey0 <- datey
-    left <- datey[1] - (datey[2] - datey[1])
-    if (min(datex) < left) left <- min(datex)
-    nd <- length(datey)
-    right <- datey[nd] + (datey[nd] - datey[nd - 1])
-    datey <- c(left, datey, right)
-  }
+  datey <- get_datey(y, datex)
 
+  x <- as.numeric(x)
 
   ct <- cut(datex, datey, right = FALSE, labels = FALSE, include.lowest = TRUE)
   tct <- table(ct)
   uct <- unique(ct)
   nuct <- na.omit(uct)
-
   # We do not need the first period, but sometimes it is matched.
   # In that case it is dropped.
   id <- match(2:(length(datey) - 1), nuct)
 
-  # if (length(uct) != length(datey0)) {
-  #    id <- id[-1]
-  # }
   fhx <- function(h.x) {
     id <- h.x - k
     id[id <= 0] <- NA
@@ -183,3 +177,48 @@ mlsd <- function(x, k, datey, ...) {
   colnames(X) <- paste("X", k, sep = ".")
   X[id, ]
 }
+
+get_datex <- function(x) UseMethod("get_datex")
+
+get_datex.zoo <- function(x) {
+  as.POSIXct(index(x))
+}
+
+get_datex.ts <- function(x) {
+  time(x)
+}
+
+get_datey <- function(y, datex) UseMethod("get_datey")
+
+get_datey.ts <- function(y, datex) {
+  left <- time(lag(y, 1))[1]
+  right <- tail(time(lag(y, -1)), n = 1)
+  if (datex[1] < left) left <- datex[1]
+  c(left, time(y), right) - 0.001
+}
+
+get_datey.default <- function(datey, datex) {
+  ## If we get here, we assume that both datey and datex are ordered and comparable
+  left <- datey[1] - (datey[2] - datey[1])
+  if (datex[1] < left) left <- datex[1]
+  nd <- length(datey)
+  right <- datey[nd] + (datey[nd] - datey[nd - 1])
+  c(left, datey, right)
+}
+
+get_datey.zoo <- function(datey, datex) {
+  ## Test whether the lag extends the dates
+  lagy <- lag(datey, 1)
+  fd_lagy <- index(lagy)[1]
+  fd_y <- index(datey)[1]
+  datey_p <- as.POSIXct(index(datey))
+  ## If the dates are not extended use heuristic for left and right low frequency dates
+  if (fd_lagy == fd_y) {
+    get_datey.default(datey_p, datex)
+  } else {
+    left <- as.POSIXct(fd_lagy)
+    right <- as.POSIXct(tail(index(lag(datey, -1)), n = 1))
+    if (datex[1] < left) left <- datex[1]
+    return(c(left, datey_p, right))
+  }
+}
diff --git a/man/USrealgdp.Rd b/man/USrealgdp.Rd
diff --git a/man/amweights.Rd b/man/amweights.Rd
diff --git a/man/average_forecast.Rd b/man/average_forecast.Rd
diff --git a/man/check_mixfreq.Rd b/man/check_mixfreq.Rd
diff --git a/man/deriv_tests.Rd b/man/deriv_tests.Rd
diff --git a/man/fmls.Rd b/man/fmls.Rd
diff --git a/man/genexp.Rd b/man/genexp.Rd
diff --git a/man/genexp_gradient.Rd b/man/genexp_gradient.Rd
diff --git a/man/hAhr_test.Rd b/man/hAhr_test.Rd
diff --git a/man/harstep.Rd b/man/harstep.Rd
diff --git a/man/harstep_gradient.Rd b/man/harstep_gradient.Rd