From a104deacfb84e2638bce7419c545b0bd8d777d70 Mon Sep 17 00:00:00 2001
From: Bernie Wang <yuyawang@amazon.com>
Date: Tue, 11 Feb 2020 14:22:44 -0800
Subject: [PATCH 01/44] More features for MQDNN, refactoring, and removal of
 ts-fields from data-entry.

---
 src/gluonts/block/encoder.py                  |  23 ++--
 .../model/seq2seq/_forking_estimator.py       |  14 +-
 src/gluonts/model/seq2seq/_forking_network.py |  14 +-
 .../model/seq2seq/_mq_dnn_estimator.py        |  95 ++++++++++---
 .../model/seq2seq/_seq2seq_estimator.py       |   2 +-
 src/gluonts/model/seq2seq/_transform.py       | 130 +++++++++++-------
 test/model/seq2seq/test_encoders.py           |  20 ++-
 .../seq2seq/test_forking_sequence_splitter.py |  87 ++++++++++--
 8 files changed, 279 insertions(+), 106 deletions(-)

diff --git a/src/gluonts/block/encoder.py b/src/gluonts/block/encoder.py
index 5245639b2e..2a3e4ee3a6 100644
--- a/src/gluonts/block/encoder.py
+++ b/src/gluonts/block/encoder.py
@@ -93,7 +93,7 @@ def _assemble_inputs(
 
         target
             target time series,
-            shape (batch_size, sequence_length)
+            shape (batch_size, sequence_length, 1)
 
         static_features
             static features,
@@ -111,7 +111,6 @@ def _assemble_inputs(
                    num_static_features + num_dynamic_features + 1)
 
         """
-        target = target.expand_dims(axis=-1)  # (N, T, 1)
 
         helper_ones = F.ones_like(target)  # Ones of (N, T, 1)
         tiled_static_features = F.batch_dot(
@@ -156,7 +155,8 @@ def __init__(
         kernel_size_seq: List[int],
         channels_seq: List[int],
         use_residual: bool = False,
-        use_covariates: bool = False,
+        use_static_feat: bool = False,
+        use_dynamic_feat: bool = False,
         **kwargs,
     ) -> None:
         assert all(
@@ -172,7 +172,8 @@ def __init__(
         super().__init__(**kwargs)
 
         self.use_residual = use_residual
-        self.use_covariates = use_covariates
+        self.use_static_feat = use_static_feat
+        self.use_dynamic_feat = use_dynamic_feat
         self.cnn = nn.HybridSequential()
 
         it = zip(channels_seq, kernel_size_seq, dilation_seq)
@@ -203,7 +204,7 @@ def hybrid_forward(
 
         target
             target time series,
-            shape (batch_size, sequence_length)
+            shape (batch_size, sequence_length, 1)
 
         static_features
             static features,
@@ -224,13 +225,17 @@ def hybrid_forward(
             shape (batch_size, sequence_length, num_dynamic_features)
         """
 
-        if self.use_covariates:
+        if self.use_dynamic_feat and self.use_static_feat:
             inputs = Seq2SeqEncoder._assemble_inputs(
                 F,
                 target=target,
                 static_features=static_features,
                 dynamic_features=dynamic_features,
             )
+        elif self.use_dynamic_feat:
+            inputs = F.concat(
+            target, dynamic_features, dim=2
+        )  # (N, T, C)
         else:
             inputs = target
 
@@ -302,7 +307,7 @@ def hybrid_forward(
 
         target
             target time series,
-            shape (batch_size, sequence_length)
+            shape (batch_size, sequence_length, 1)
 
         static_features
             static features,
@@ -442,7 +447,7 @@ def hybrid_forward(
 
         target
             target time series,
-            shape (batch_size, sequence_length)
+            shape (batch_size, sequence_length, 1)
 
         static_features
             static features,
@@ -473,4 +478,4 @@ def hybrid_forward(
             F.slice_axis(dynamic_code, axis=1, begin=-1, end=None), axis=1
         )
 
-        return static_code, dynamic_code
+        return static_code, dynamic_code
\ No newline at end of file
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 207d4a5126..9f05f7aefd 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -29,12 +29,14 @@
 from gluonts.trainer import Trainer
 from gluonts.transform import (
     AsNumpyArray,
+    AddAgeFeature,
     Chain,
     TestSplitSampler,
     Transformation,
 )
 
 # Relative imports
+# from transform import AddAgeFeature
 from ._forking_network import (
     ForkingSeq2SeqPredictionNetwork,
     ForkingSeq2SeqTrainingNetwork,
@@ -115,11 +117,21 @@ def __init__(
     def create_transformation(self) -> Transformation:
         return Chain(
             trans=[
-                AsNumpyArray(field=FieldName.TARGET, expected_ndim=1),
+                AsNumpyArray(
+                    field=FieldName.TARGET, expected_ndim=1, dtype=self.dtype
+                ),
+                AddAgeFeature(
+                    target_field=FieldName.TARGET,
+                    output_field=FieldName.FEAT_DYNAMIC_REAL,
+                    log_scale=True,
+                    pred_length=self.prediction_length,
+                    dtype=self.dtype,
+                ),
                 ForkingSequenceSplitter(
                     train_sampler=TestSplitSampler(),
                     enc_len=self.context_length,
                     dec_len=self.prediction_length,
+                    encoder_series_fields=[FieldName.FEAT_DYNAMIC_REAL],
                 ),
             ]
         )
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 478c720435..359bc5a9ea 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -67,7 +67,7 @@ def __init__(
 class ForkingSeq2SeqTrainingNetwork(ForkingSeq2SeqNetworkBase):
     # noinspection PyMethodOverriding
     def hybrid_forward(
-        self, F, past_target: Tensor, future_target: Tensor
+        self, F, past_target: Tensor, past_feat_dynamic_real: Tensor, future_target: Tensor
     ) -> Tensor:
         """
         Parameters
@@ -84,9 +84,13 @@ def hybrid_forward(
         loss with shape (FIXME, FIXME)
         """
 
+        # print(f"past target: {past_target.shape}")
+        # print(f"past_feat_dynamic_real: {past_feat_dynamic_real.shape}")
+        # print(f"future_target: {future_target.shape}")
+
         # FIXME: can we factor out a common prefix in the base network?
         feat_static_real = nd_None
-        past_feat_dynamic_real = nd_None
+        # past_feat_dynamic_real = nd_None
         future_feat_dynamic_real = nd_None
 
         enc_output_static, enc_output_dynamic = self.encoder(
@@ -100,13 +104,15 @@ def hybrid_forward(
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
         dec_dist_output = self.quantile_proj(dec_output)
 
+        # print(f"decoder output: {dec_dist_output.shape}")
+
         loss = self.loss(future_target, dec_dist_output)
         return loss.mean(axis=1)
 
 
 class ForkingSeq2SeqPredictionNetwork(ForkingSeq2SeqNetworkBase):
     # noinspection PyMethodOverriding
-    def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
+    def hybrid_forward(self, F, past_target: Tensor, past_feat_dynamic_real: Tensor) -> Tensor:
         """
         Parameters
         ----------
@@ -122,7 +128,7 @@ def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
 
         # FIXME: can we factor out a common prefix in the base network?
         feat_static_real = nd_None
-        past_feat_dynamic_real = nd_None
+        # past_feat_dynamic_real = nd_None
         future_feat_dynamic_real = nd_None
 
         enc_output_static, enc_output_dynamic = self.encoder(
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index d7a8854258..c89854dca1 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -15,6 +15,7 @@
 from typing import List, Optional
 
 # First-party imports
+from gluonts.evaluation.backtest import make_evaluation_predictions
 from gluonts.block.decoder import ForkingMLPDecoder
 from gluonts.block.encoder import (
     HierarchicalCausalConv1DEncoder,
@@ -26,7 +27,10 @@
 from gluonts.trainer import Trainer
 
 # Relative imports
-from ._forking_estimator import ForkingSeq2SeqEstimator
+from gluonts.model.seq2seq._forking_estimator import ForkingSeq2SeqEstimator
+from gluonts.evaluation import Evaluator
+import numpy as np
+import mxnet as mx
 
 
 class MQDNNEstimator(ForkingSeq2SeqEstimator):
@@ -44,9 +48,7 @@ def __init__(
         context_length: Optional[int],
         prediction_length: int,
         freq: str,
-        # FIXME: why do we have two parameters here?
-        mlp_final_dim: int = 20,
-        mlp_hidden_dimension_seq: List[int] = list(),
+        decoder_mlp_dim_seq: List[int] = [20],
         quantiles: List[float] = list(),
         trainer: Trainer = Trainer(),
     ) -> None:
@@ -54,13 +56,13 @@ def __init__(
             prediction_length if context_length is None else context_length
         )
         assert all(
-            [d > 0 for d in mlp_hidden_dimension_seq]
+            [d > 0 for d in decoder_mlp_dim_seq]
         ), "Elements of `mlp_hidden_dimension_seq` should be > 0"
 
         decoder = ForkingMLPDecoder(
             dec_len=prediction_length,
-            final_dim=mlp_final_dim,
-            hidden_dimension_sequence=mlp_hidden_dimension_seq,
+            final_dim=decoder_mlp_dim_seq[-1],
+            hidden_dimension_sequence=decoder_mlp_dim_seq[:-1],
             prefix="decoder_",
         )
 
@@ -89,25 +91,40 @@ def __init__(
         prediction_length: int,
         freq: str,
         context_length: Optional[int] = None,
-        # FIXME: prefix those so clients know that these are decoder params
-        mlp_final_dim: int = 20,
-        mlp_hidden_dimension_seq: List[int] = list(),
+        seed: Optional[int] = None,
+        decoder_mlp_dim_seq: List[int] = [20],
+        channels_seq: List[int] = [30, 30, 30],
+        dilation_seq: List[int] = [1, 3, 9],
+        kernel_size_seq: List[int] = [3, 3, 3],
+        use_residual: bool = True,
         quantiles: List[float] = list(
             [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
         ),
         trainer: Trainer = Trainer(),
     ) -> None:
+
+        if seed is not None:  # 0 is a valid seed
+            np.random.seed(seed)
+            mx.random.seed(seed)
+
+        assert (
+            len(channels_seq) == len(dilation_seq) == len(kernel_size_seq)
+        ), (
+            f"mismatch CNN configurations: {len(channels_seq)} vs. "
+            f"{len(dilation_seq)} vs. {len(kernel_size_seq)}"
+        )
+
         encoder = HierarchicalCausalConv1DEncoder(
-            dilation_seq=[1, 3, 9],
-            kernel_size_seq=([3] * len([30, 30, 30])),
-            channels_seq=[30, 30, 30],
-            use_residual=True,
+            dilation_seq=dilation_seq,
+            kernel_size_seq=kernel_size_seq,
+            channels_seq=channels_seq,
+            use_residual=use_residual,
+            use_dynamic_feat=True,
             prefix="encoder_",
         )
         super(MQCNNEstimator, self).__init__(
             encoder=encoder,
-            mlp_final_dim=mlp_final_dim,
-            mlp_hidden_dimension_seq=mlp_hidden_dimension_seq,
+            decoder_mlp_dim_seq=decoder_mlp_dim_seq,
             freq=freq,
             prediction_length=prediction_length,
             trainer=trainer,
@@ -128,9 +145,7 @@ def __init__(
         prediction_length: int,
         freq: str,
         context_length: Optional[int] = None,
-        # FIXME: prefix those so clients know that these are decoder params
-        mlp_final_dim: int = 20,
-        mlp_hidden_dimension_seq: List[int] = list(),
+        decoder_mlp_dim_seq: List[int] = [20],
         trainer: Trainer = Trainer(),
         quantiles: List[float] = list([0.1, 0.5, 0.9]),
     ) -> None:
@@ -143,11 +158,49 @@ def __init__(
         )
         super(MQRNNEstimator, self).__init__(
             encoder=encoder,
-            mlp_final_dim=mlp_final_dim,
-            mlp_hidden_dimension_seq=mlp_hidden_dimension_seq,
+            decoder_mlp_dim_seq=decoder_mlp_dim_seq,
             freq=freq,
             prediction_length=prediction_length,
             trainer=trainer,
             context_length=context_length,
             quantiles=quantiles,
         )
+
+
+if __name__ == "__main__":
+    from gluonts.dataset.repository.datasets import (
+        get_dataset,
+        dataset_recipes,
+    )
+
+    print(f"datasets available: {dataset_recipes.keys()}")
+
+    # we pick m4_hourly as it only contains a few hundred time series
+    dataset = get_dataset("m4_hourly", regenerate=False)
+
+    metrics = []
+
+    for _ in range(1):
+        estimator = MQCNNEstimator(
+            prediction_length=dataset.metadata.prediction_length,
+            seed=42,
+            freq=dataset.metadata.freq,
+            quantiles=[0.5],
+            trainer=Trainer(
+                epochs=1, num_batches_per_epoch=10, hybridize=True
+            ),
+        )
+
+        predictor = estimator.train(dataset.train)
+
+        forecast_it, ts_it = make_evaluation_predictions(
+            dataset.test, predictor=predictor, num_samples=100
+        )
+
+        agg_metrics, item_metrics = Evaluator()(
+            ts_it, forecast_it, num_series=len(dataset.test)
+        )
+
+        metrics.append(agg_metrics["wQuantileLoss[0.5]"])
+
+    print(metrics)
diff --git a/src/gluonts/model/seq2seq/_seq2seq_estimator.py b/src/gluonts/model/seq2seq/_seq2seq_estimator.py
index c3a85e9b2d..fd1b0ba90d 100644
--- a/src/gluonts/model/seq2seq/_seq2seq_estimator.py
+++ b/src/gluonts/model/seq2seq/_seq2seq_estimator.py
@@ -278,7 +278,7 @@ def __init__(
             kernel_size_seq=([3] * len([30, 30, 30])),
             channels_seq=[30, 30, 30],
             use_residual=True,
-            use_covariates=True,
+            use_dynamic_feat=True,
         )
 
         super(CNN2QRForecaster, self).__init__(
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index 06ce4ac918..8a147b0670 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -12,6 +12,7 @@
 # permissions and limitations under the License.
 
 # Standard library imports
+from collections import Counter
 from typing import Iterator, List
 
 # Third-party imports
@@ -23,6 +24,16 @@
 from gluonts.transform import FlatMapTransformation, shift_timestamp
 
 
+def pad_to_size(xs, size):
+    """Pads `xs` with 0 on the left on the last axis."""
+    pad_length = size - xs.shape[-1]
+    if pad_length <= 0:
+        return xs
+
+    pad_width = ([(0, 0)] * (xs.ndim - 1)) + [(pad_length, 0)]
+    return np.pad(xs, pad_width, mode="constant")
+
+
 class ForkingSequenceSplitter(FlatMapTransformation):
     """Forking sequence splitter."""
 
@@ -32,12 +43,14 @@ def __init__(
         train_sampler,
         enc_len: int,
         dec_len: int,
-        time_series_fields: List[str] = None,
-        target_in="target",
+        target_in: str = "target",
+        encoder_series_fields: List[str] = None,
+        decoder_series_fields: List[str] = [],
         is_pad_out: str = "is_pad",
-        start_in: str = "start",
-        forecast_start_out: str = "forecast_start",
+        start_input_field: str = "start",
+        forecast_start_output_field: str = "forecast_start",
     ) -> None:
+
         assert enc_len > 0, "The value of `enc_len` should be > 0"
         assert dec_len > 0, "The value of `dec_len` should be > 0"
 
@@ -45,12 +58,13 @@ def __init__(
         self.enc_len = enc_len
         self.dec_len = dec_len
         self.ts_fields = (
-            time_series_fields if time_series_fields is not None else []
+            encoder_series_fields if encoder_series_fields is not None else []
         )
         self.target_in = target_in
         self.is_pad_out = is_pad_out
-        self.start_in = start_in
-        self.forecast_start_out = forecast_start_out
+        self.start_in = start_input_field
+        self.forecast_start_out = forecast_start_output_field
+        self.decoder_series_fields = decoder_series_fields
 
     def _past(self, col_name):
         return f"past_{col_name}"
@@ -61,63 +75,77 @@ def _future(self, col_name):
     def flatmap_transform(
         self, data: DataEntry, is_train: bool
     ) -> Iterator[DataEntry]:
-        dec_len = self.dec_len
-        slice_cols = self.ts_fields + [self.target_in]
         target = data[self.target_in]
 
         if is_train:
+            # We currently cannot handle time series that are shorter than the
+            # prediction length during training, so we just skip these.
+            # If we want to include them we would need to pad and to mask
+            # the loss.
             if len(target) < self.dec_len:
-                # We currently cannot handle time series that are shorter than the
-                # prediction length during training, so we just skip these.
-                # If we want to include them we would need to pad and to mask
-                # the loss.
-                sampling_indices: List[int] = []
-            else:
-                sampling_indices = self.train_sampler(
-                    target, 0, len(target) - self.dec_len
-                )
+                return
+
+            sampling_indices = self.train_sampler(
+                target, 0, len(target) - self.dec_len
+            )
         else:
             sampling_indices = [len(target)]
 
-        for i in sampling_indices:
-            pad_length = max(self.enc_len - i, 0)
-
-            d = data.copy()
-            for ts_field in slice_cols:
-                if i > self.enc_len:
-                    # truncate to past_length
-                    past_piece = d[ts_field][..., i - self.enc_len : i]
-                elif i < self.enc_len:
-                    pad_block = np.zeros(
-                        d[ts_field].shape[:-1] + (pad_length,)
-                    )
-                    past_piece = np.concatenate(
-                        [pad_block, d[ts_field][..., :i]], axis=-1
-                    )
+        decoder_fields = set([self.target_in] + self.decoder_series_fields)
+
+        ts_fields_counter = Counter(
+            self.ts_fields + [self.target_in] + self.decoder_series_fields
+        )
+
+        for sampling_idx in sampling_indices:
+            # ensure start index is not negative
+            start_idx = max(0, sampling_idx - self.enc_len)
+
+            out = data.copy()
+
+            for ts_field in list(ts_fields_counter.keys()):
+
+                # target is 1d, this ensures ts is always 2d
+                ts = np.atleast_2d(out[ts_field])
+
+                if ts_fields_counter[ts_field] == 1:
+                    del out[ts_field]
                 else:
-                    past_piece = d[ts_field][..., :i]
+                    ts_fields_counter[ts_field] -= 1
+
+                # take enc_len values from ts, depending on sampling_idx
+                slice = ts[:, start_idx:sampling_idx]
 
-                d[self._past(ts_field)] = np.expand_dims(past_piece, -1)
+                # if we have less than enc_len values, pad_left with 0
+                past_piece = pad_to_size(slice, self.enc_len)
 
-                if is_train and ts_field is self.target_in:
+                out[f"past_{ts_field}"] = past_piece.transpose()
+
+                # in prediction mode, don't provide decode-values
+                if not is_train and ts_field == self.target_in:
+                    continue
+
+                if ts_field in decoder_fields:
+                    d3 = () if ts_field == self.target_in else (len(ts),)
                     forking_dec_field = np.zeros(
-                        shape=(self.enc_len, self.dec_len)
+                        shape=(self.enc_len, self.dec_len) + d3
                     )
 
-                    for j in range(self.enc_len):
-                        start_idx = i - self.enc_len + j + 1
-                        if start_idx >= 0:
-                            forking_dec_field[j, :] = d[ts_field][
-                                ..., start_idx : start_idx + dec_len
-                            ]
-
-                    d[self._future(ts_field)] = forking_dec_field
+                    skip = max(0, self.enc_len - 1 - sampling_idx)
+                    for dec_field, idx in zip(
+                        forking_dec_field[skip:], range(start_idx)
+                    ):
+                        dec_field[:] = ts[:, idx : idx + self.dec_len]
 
-                del d[ts_field]
+                    out[self._future(ts_field)] = forking_dec_field
 
             pad_indicator = np.zeros(self.enc_len)
-            if pad_length > 0:
-                pad_indicator[:pad_length] = 1
-            d[self._past(self.is_pad_out)] = pad_indicator
-            d[self.forecast_start_out] = shift_timestamp(d[self.start_in], i)
-            yield d
+            pad_length = max(0, self.enc_len - sampling_idx)
+            pad_indicator[:pad_length] = True
+            out[f"past_{self.is_pad_out}"] = pad_indicator
+
+            out[self.forecast_start_out] = shift_timestamp(
+                out[self.start_in], sampling_idx
+            )
+
+            yield out
diff --git a/test/model/seq2seq/test_encoders.py b/test/model/seq2seq/test_encoders.py
index 7b8e594119..96063864de 100644
--- a/test/model/seq2seq/test_encoders.py
+++ b/test/model/seq2seq/test_encoders.py
@@ -21,20 +21,30 @@
 nd_None = nd.array([])
 
 
-@pytest.mark.skip()
-def test_hierarchical_cnn_encoders() -> None:
+@pytest.mark.parametrize("use_residual", [True, False])
+@pytest.mark.parametrize("hybridize", [True, False])
+def test_hierarchical_cnn_encoders(use_residual, hybridize) -> None:
     num_ts = 2
     ts_len = 10
+    num_static_feat = 2
+    num_dynamic_feat = 5
+
     test_data = nd.arange(num_ts * ts_len).reshape(shape=(num_ts, ts_len, 1))
+    test_static_feat = nd.random.randn(num_ts, num_static_feat)
+    test_dynamic_feat = nd.random.randn(num_ts, ts_len, num_dynamic_feat)
 
     chl_dim = [30, 30, 30]
     ks_seq = [3] * len(chl_dim)
     dial_seq = [1, 3, 9]
 
     cnn = HierarchicalCausalConv1DEncoder(
-        dial_seq, ks_seq, chl_dim, use_residual=True
+        dial_seq, ks_seq, chl_dim, use_residual, use_dynamic_feat=True
     )
     cnn.collect_params().initialize()
-    cnn.hybridize()
 
-    print(cnn(test_data, nd_None, nd_None)[1].shape)
+    if hybridize:
+        cnn.hybridize()
+
+    true_shape = (num_ts, ts_len, 31) if use_residual else (num_ts, ts_len, 30)
+
+    assert cnn(test_data, test_static_feat, test_dynamic_feat)[1].shape == true_shape
\ No newline at end of file
diff --git a/test/model/seq2seq/test_forking_sequence_splitter.py b/test/model/seq2seq/test_forking_sequence_splitter.py
index f4bc9f37ec..2798074bdd 100644
--- a/test/model/seq2seq/test_forking_sequence_splitter.py
+++ b/test/model/seq2seq/test_forking_sequence_splitter.py
@@ -15,6 +15,8 @@
 import numpy as np
 
 # First-party imports
+import pytest
+
 from gluonts import transform
 from gluonts.dataset.common import ListDataset
 from gluonts.dataset.field_names import FieldName
@@ -22,23 +24,22 @@
 
 # if we import TestSplitSampler as Test... pytest thinks it's a test
 from gluonts.transform import TestSplitSampler as TSplitSampler
+from gluonts.time_feature import time_features_from_frequency_str
 
 
-def test_forking_sequence_splitter() -> None:
-    def make_dataset(N, train_length):
-        # generates 2 ** N - 1 timeseries with constant increasing values
-        n = 2 ** N - 1
+def make_dataset(N, train_length):
+    # generates 2 ** N - 1 timeseries with constant increasing values
+    n = 2 ** N - 1
 
-        targets = np.arange(n * train_length).reshape((n, train_length))
+    targets = np.arange(n * train_length).reshape((n, train_length))
 
-        return ListDataset(
-            [
-                {"start": "2012-01-01", "target": targets[i, :]}
-                for i in range(n)
-            ],
-            freq="D",
-        )
+    return ListDataset(
+        [{"start": "2012-01-01", "target": targets[i, :]} for i in range(n)],
+        freq="D",
+    )
 
+
+def test_forking_sequence_splitter() -> None:
     ds = make_dataset(1, 20)
 
     trans = transform.Chain(
@@ -50,9 +51,9 @@ def make_dataset(N, train_length):
             ),
             ForkingSequenceSplitter(
                 train_sampler=TSplitSampler(),
-                time_series_fields=["age"],
                 enc_len=5,
                 dec_len=3,
+                encoder_series_fields=["age"],
             ),
         ]
     )
@@ -84,9 +85,9 @@ def make_dataset(N, train_length):
             ),
             ForkingSequenceSplitter(
                 train_sampler=TSplitSampler(),
-                time_series_fields=["age"],
                 enc_len=20,
                 dec_len=20,
+                encoder_series_fields=["age"],
             ),
         ]
     )
@@ -97,3 +98,61 @@ def make_dataset(N, train_length):
         np.sum(transformed_data_oob["future_target"]) - np.sum(np.arange(20))
         < 1e-5
     ), "the forking sequence target should be computed correctly."
+
+
+@pytest.mark.parametrize("is_train", [True, False])
+def test_forking_sequence_with_features(is_train) -> None:
+    def make_dataset(N, train_length):
+        # generates 2 ** N - 1 timeseries with constant increasing values
+        n = 2 ** N - 1
+
+        targets = np.arange(n * train_length).reshape((n, train_length))
+
+        return ListDataset(
+            [
+                {"start": "2012-01-01", "target": targets[i, :]}
+                for i in range(n)
+            ],
+            freq="D",
+        )
+
+    ds = make_dataset(1, 20)
+
+    trans = transform.Chain(
+        trans=[
+            transform.AddAgeFeature(
+                target_field=FieldName.TARGET,
+                output_field=FieldName.FEAT_AGE,
+                pred_length=10,
+            ),
+            transform.AddTimeFeatures(
+                start_field=FieldName.START,
+                target_field=FieldName.TARGET,
+                output_field=FieldName.FEAT_TIME,
+                time_features=time_features_from_frequency_str("D"),
+                pred_length=10,
+            ),
+            ForkingSequenceSplitter(
+                train_sampler=TSplitSampler(),
+                enc_len=5,
+                dec_len=3,
+                target_in=FieldName.TARGET,
+                encoder_series_fields=[
+                    FieldName.FEAT_AGE,
+                    FieldName.FEAT_TIME,
+                ],
+                decoder_series_fields=[FieldName.FEAT_TIME],
+            ),
+        ]
+    )
+
+    out = trans(iter(ds), is_train=is_train)
+    transformed_data = next(iter(out))
+
+    assert transformed_data["past_target"].shape == (5, 1)
+    assert transformed_data["past_feat_dynamic_age"].shape == (5, 1)
+    assert transformed_data["past_time_feat"].shape == (5, 3)
+    assert transformed_data["future_time_feat"].shape == (5, 3, 3)
+
+    if is_train:
+        assert transformed_data["future_target"].shape == (5, 3)

From 56b150bb0cc171c0176c74078f4953d08b86927e Mon Sep 17 00:00:00 2001
From: Bernie Wang <yuyawang@amazon.com>
Date: Mon, 17 Feb 2020 17:21:00 -0800
Subject: [PATCH 02/44] fix the future target calculation

---
 src/gluonts/model/seq2seq/_transform.py       |  3 +-
 .../seq2seq/test_forking_sequence_splitter.py | 38 +++++++------------
 2 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index 8a147b0670..dc0ff84664 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -133,7 +133,8 @@ def flatmap_transform(
 
                     skip = max(0, self.enc_len - 1 - sampling_idx)
                     for dec_field, idx in zip(
-                        forking_dec_field[skip:], range(start_idx)
+                        forking_dec_field[skip:],
+                        range(start_idx + 1, start_idx + self.enc_len + 1),
                     ):
                         dec_field[:] = ts[:, idx : idx + self.dec_len]
 
diff --git a/test/model/seq2seq/test_forking_sequence_splitter.py b/test/model/seq2seq/test_forking_sequence_splitter.py
index 2798074bdd..2087f86551 100644
--- a/test/model/seq2seq/test_forking_sequence_splitter.py
+++ b/test/model/seq2seq/test_forking_sequence_splitter.py
@@ -40,19 +40,22 @@ def make_dataset(N, train_length):
 
 
 def test_forking_sequence_splitter() -> None:
-    ds = make_dataset(1, 20)
+    len_ts = 20
+    ds = make_dataset(1, len_ts)
+    enc_len = 5
+    dec_len = 3
 
     trans = transform.Chain(
         trans=[
             transform.AddAgeFeature(
                 target_field=FieldName.TARGET,
                 output_field="age",
-                pred_length=10,
+                pred_length=dec_len,
             ),
             ForkingSequenceSplitter(
                 train_sampler=TSplitSampler(),
-                enc_len=5,
-                dec_len=3,
+                enc_len=enc_len,
+                dec_len=dec_len,
                 encoder_series_fields=["age"],
             ),
         ]
@@ -70,34 +73,19 @@ def test_forking_sequence_splitter() -> None:
             [17.0, 18.0, 19.0],
         ]
     )
-
     assert (
         np.linalg.norm(future_target - transformed_data["future_target"])
         < 1e-5
     ), "the forking sequence target should be computed correctly."
 
-    trans_oob = transform.Chain(
-        trans=[
-            transform.AddAgeFeature(
-                target_field=FieldName.TARGET,
-                output_field="age",
-                pred_length=10,
-            ),
-            ForkingSequenceSplitter(
-                train_sampler=TSplitSampler(),
-                enc_len=20,
-                dec_len=20,
-                encoder_series_fields=["age"],
-            ),
-        ]
-    )
-
-    transformed_data_oob = next(iter(trans_oob(iter(ds), is_train=True)))
-
+    age = np.log10(2.0 + np.arange(len_ts))
     assert (
-        np.sum(transformed_data_oob["future_target"]) - np.sum(np.arange(20))
+        np.linalg.norm(
+            age[-(enc_len + dec_len) : -dec_len]
+            - transformed_data["past_age"].flatten()
+        )
         < 1e-5
-    ), "the forking sequence target should be computed correctly."
+    ), "the forking sequence past feature should be computed correctly."
 
 
 @pytest.mark.parametrize("is_train", [True, False])

From 356efaac4a1826e20571add5efc50e3f97fa56b0 Mon Sep 17 00:00:00 2001
From: Jasper Schulz <schjaspe@amazon.de>
Date: Tue, 25 Feb 2020 15:18:26 +0100
Subject: [PATCH 03/44] Added derive_auto_fields method.

---
 src/gluonts/model/deepar/_estimator.py | 15 +++++++++++++--
 src/gluonts/model/estimator.py         |  9 +++++++++
 src/gluonts/model/predictor.py         |  9 +++++++++
 src/gluonts/shell/train.py             | 12 ++++++------
 4 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/gluonts/model/deepar/_estimator.py b/src/gluonts/model/deepar/_estimator.py
index 7005f49c85..a593eb76bb 100644
--- a/src/gluonts/model/deepar/_estimator.py
+++ b/src/gluonts/model/deepar/_estimator.py
@@ -21,6 +21,7 @@
 # First-party imports
 from gluonts.core.component import DType, validated
 from gluonts.dataset.field_names import FieldName
+from gluonts.dataset.stat import calculate_dataset_statistics
 from gluonts.distribution import DistributionOutput, StudentTOutput
 from gluonts.model.estimator import GluonEstimator
 from gluonts.model.predictor import Predictor, RepresentableBlockPredictor
@@ -146,8 +147,8 @@ def __init__(
         assert num_layers > 0, "The value of `num_layers` should be > 0"
         assert num_cells > 0, "The value of `num_cells` should be > 0"
         assert dropout_rate >= 0, "The value of `dropout_rate` should be >= 0"
-        assert (cardinality is not None and use_feat_static_cat) or (
-            cardinality is None and not use_feat_static_cat
+        assert (cardinality and use_feat_static_cat) or (
+            not (cardinality or use_feat_static_cat)
         ), "You should set `cardinality` if and only if `use_feat_static_cat=True`"
         assert cardinality is None or all(
             [c > 0 for c in cardinality]
@@ -197,6 +198,16 @@ def __init__(
 
         self.num_parallel_samples = num_parallel_samples
 
+    @classmethod
+    def derive_auto_fields(cls, train_iter):
+        stats = calculate_dataset_statistics(train_iter)
+
+        return {
+            "use_feat_dynamic_real": stats.num_feat_dynamic_real > 0,
+            "use_feat_static_cat": bool(stats.feat_static_cat),
+            "cardinality": [len(cats) for cats in stats.feat_static_cat],
+        }
+
     def create_transformation(self) -> Transformation:
         remove_field_names = [FieldName.FEAT_DYNAMIC_CAT]
         if not self.use_feat_static_real:
diff --git a/src/gluonts/model/estimator.py b/src/gluonts/model/estimator.py
index 55e6409b2c..e18d801763 100644
--- a/src/gluonts/model/estimator.py
+++ b/src/gluonts/model/estimator.py
@@ -69,6 +69,15 @@ def train(
     def from_hyperparameters(cls, **hyperparameters):
         return from_hyperparameters(cls, **hyperparameters)
 
+    @classmethod
+    def derive_auto_fields(cls, train_iter):
+        return {}
+
+    @classmethod
+    def from_inputs(cls, train_iter, params):
+        auto_params = cls.derive_auto_fields(train_iter)
+        return cls.from_hyperparameters(**auto_params, **params)
+
 
 class DummyEstimator(Estimator):
     """
diff --git a/src/gluonts/model/predictor.py b/src/gluonts/model/predictor.py
index bfbcd83fa6..61263e848f 100644
--- a/src/gluonts/model/predictor.py
+++ b/src/gluonts/model/predictor.py
@@ -156,6 +156,15 @@ def deserialize(
     def from_hyperparameters(cls, **hyperparameters):
         return from_hyperparameters(cls, **hyperparameters)
 
+    @classmethod
+    def derive_auto_fields(cls, train_iter):
+        return {}
+
+    @classmethod
+    def from_inputs(cls, train_iter, params):
+        auto_params = cls.derive_auto_fields(train_iter)
+        return cls.from_hyperparameters(**auto_params, **params)
+
 
 class RepresentablePredictor(Predictor):
     """
diff --git a/src/gluonts/shell/train.py b/src/gluonts/shell/train.py
index f01f065069..e561c15ba8 100644
--- a/src/gluonts/shell/train.py
+++ b/src/gluonts/shell/train.py
@@ -22,7 +22,6 @@
 from gluonts.core.serde import dump_code
 from gluonts.evaluation import Evaluator, backtest
 from gluonts.dataset.common import Dataset
-from gluonts.dataset.stat import calculate_dataset_statistics
 from gluonts.model.estimator import Estimator
 from gluonts.model.predictor import Predictor
 from gluonts.transform import FilterTransformation, TransformedDataset
@@ -54,13 +53,18 @@ def run_train_and_test(
 ) -> None:
     check_gpu_support()
 
+    # train_stats = calculate_dataset_statistics(env.datasets["train"])
+    # log_metric("train_dataset_stats", train_stats)
+
     forecaster_fq_name = fqname_for(forecaster_type)
     forecaster_version = forecaster_type.__version__
 
     logger.info(f"Using gluonts v{gluonts.__version__}")
     logger.info(f"Using forecaster {forecaster_fq_name} v{forecaster_version}")
 
-    forecaster = forecaster_type.from_hyperparameters(**env.hyperparameters)
+    forecaster = forecaster_type.from_inputs(
+        env.datasets["train"], env.hyperparameters
+    )
 
     logger.info(
         f"The forecaster can be reconstructed with the following expression: "
@@ -90,10 +94,6 @@ def run_train(
     train_dataset: Dataset,
     validation_dataset: Optional[Dataset],
 ) -> Predictor:
-    log_metric(
-        "train_dataset_stats", calculate_dataset_statistics(train_dataset)
-    )
-
     return forecaster.train(train_dataset, validation_dataset)
 
 

From eefce491591ca82159f09997a0547dd933d872ad Mon Sep 17 00:00:00 2001
From: Bernie Wang <yuyawang@amazon.com>
Date: Tue, 25 Feb 2020 16:28:41 +0100
Subject: [PATCH 04/44] add use_dynamic_feat option

---
 .../model/seq2seq/_forking_estimator.py       |  65 +++++--
 src/gluonts/model/seq2seq/_forking_network.py | 160 ++++++++++++++++--
 .../model/seq2seq/_mq_dnn_estimator.py        |   8 +-
 3 files changed, 205 insertions(+), 28 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 9f05f7aefd..1db875730c 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -30,17 +30,19 @@
 from gluonts.transform import (
     AsNumpyArray,
     AddAgeFeature,
+    AddTimeFeatures,
     Chain,
     TestSplitSampler,
     Transformation,
 )
 
 # Relative imports
-# from transform import AddAgeFeature
+from gluonts.time_feature import time_features_from_frequency_str
 from ._forking_network import (
     ForkingSeq2SeqPredictionNetwork,
     ForkingSeq2SeqTrainingNetwork,
-)
+    ForkingSeq2SeqNetwork, ForkingSeq2SeqNetworkBase, ForkingSeq2SeqTargetPredictionNetwork,
+    ForkingSeq2SeqTargetTrainingNetwork)
 from ._transform import ForkingSequenceSplitter
 
 
@@ -93,6 +95,7 @@ def __init__(
         quantile_output: QuantileOutput,
         freq: str,
         prediction_length: int,
+        use_dynamic_feat: bool = False,
         context_length: Optional[int] = None,
         trainer: Trainer = Trainer(),
     ) -> None:
@@ -110,22 +113,32 @@ def __init__(
         self.quantile_output = quantile_output
         self.prediction_length = prediction_length
         self.freq = freq
+        self.use_dynamic_feat = use_dynamic_feat
         self.context_length = (
             context_length if context_length is not None else prediction_length
         )
 
     def create_transformation(self) -> Transformation:
-        return Chain(
+
+        if self.use_dynamic_feat:
+            feat_def = Chain(
             trans=[
                 AsNumpyArray(
                     field=FieldName.TARGET, expected_ndim=1, dtype=self.dtype
                 ),
-                AddAgeFeature(
+                # AddAgeFeature(
+                #     target_field=FieldName.TARGET,
+                #     output_field=FieldName.FEAT_DYNAMIC_REAL,
+                #     log_scale=True,
+                #     pred_length=self.prediction_length,
+                #     dtype=self.dtype,
+                # ),
+                AddTimeFeatures(
+                    start_field=FieldName.START,
                     target_field=FieldName.TARGET,
                     output_field=FieldName.FEAT_DYNAMIC_REAL,
-                    log_scale=True,
+                    time_features= time_features_from_frequency_str(self.freq),
                     pred_length=self.prediction_length,
-                    dtype=self.dtype,
                 ),
                 ForkingSequenceSplitter(
                     train_sampler=TestSplitSampler(),
@@ -133,21 +146,43 @@ def create_transformation(self) -> Transformation:
                     dec_len=self.prediction_length,
                     encoder_series_fields=[FieldName.FEAT_DYNAMIC_REAL],
                 ),
-            ]
-        )
+            ])
+        else:
+            feat_def = Chain(
+            trans=[
+                AsNumpyArray(
+                    field=FieldName.TARGET, expected_ndim=1, dtype=self.dtype
+                ),
+                ForkingSequenceSplitter(
+                    train_sampler=TestSplitSampler(),
+                    enc_len=self.context_length,
+                    dec_len=self.prediction_length,
+                ),
+            ])
+
+        return feat_def
+
+    def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
+        # return ForkingSeq2SeqTrainingNetwork(
+        #     encoder=self.encoder,
+        #     enc2dec=PassThroughEnc2Dec(),
+        #     decoder=self.decoder,
+        #     quantile_output=self.quantile_output,
+        # )
 
-    def create_training_network(self) -> ForkingSeq2SeqTrainingNetwork:
-        return ForkingSeq2SeqTrainingNetwork(
+        return ForkingSeq2SeqNetwork(
             encoder=self.encoder,
             enc2dec=PassThroughEnc2Dec(),
             decoder=self.decoder,
             quantile_output=self.quantile_output,
-        )
+            use_dynamic_real=self.use_dynamic_feat
+        ).get_training_network()
+
 
     def create_predictor(
         self,
         transformation: Transformation,
-        trained_network: ForkingSeq2SeqTrainingNetwork,
+        trained_network: ForkingSeq2SeqNetworkBase,
     ) -> Predictor:
         # todo: this is specific to quantile output
         quantile_strs = [
@@ -155,12 +190,14 @@ def create_predictor(
             for quantile in self.quantile_output.quantiles
         ]
 
-        prediction_network = ForkingSeq2SeqPredictionNetwork(
+
+        prediction_network = ForkingSeq2SeqNetwork(
             encoder=trained_network.encoder,
             enc2dec=trained_network.enc2dec,
             decoder=trained_network.decoder,
             quantile_output=trained_network.quantile_output,
-        )
+            use_dynamic_real=self.use_dynamic_feat
+        ).get_prediction_network()
 
         copy_parameters(trained_network, prediction_network)
 
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 359bc5a9ea..c805f04030 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -59,15 +59,79 @@ def __init__(
         self.decoder = decoder
         self.quantile_output = quantile_output
 
+        self.feat_static_real = nd_None
+        self.past_feat_dynamic_real = nd_None
+        self.future_feat_dynamic_real = nd_None
+
         with self.name_scope():
             self.quantile_proj = quantile_output.get_quantile_proj()
             self.loss = quantile_output.get_loss()
 
+class ForkingSeq2SeqNetwork():
+    @validated()
+    def __init__(
+            self,
+            encoder: Seq2SeqEncoder,
+            enc2dec: Seq2SeqEnc2Dec,
+            decoder: Seq2SeqDecoder,
+            quantile_output: QuantileOutput,
+            use_dynamic_real: bool = False,
+            use_static_cat: bool = False,
+            **kwargs,
+    ) -> None:
+        self.encoder = encoder
+        self.enc2dec = enc2dec
+        self.decoder = decoder
+        self.quantile_output = quantile_output
+
+        self.use_dynamic_real = use_dynamic_real
+        self.use_static_cat = use_static_cat
+
+    def get_training_network(self) -> ForkingSeq2SeqNetworkBase:
+        if self.use_static_cat is False and self.use_dynamic_real is False:
+            return ForkingSeq2SeqTargetTrainingNetwork(
+                encoder=self.encoder,
+                enc2dec=self.enc2dec,
+                decoder=self.decoder,
+                quantile_output=self.quantile_output
+            )
+        elif self.use_static_cat is False and self.use_dynamic_real:
+            return ForkingSeq2SeqTrainingNetwork(
+                encoder=self.encoder,
+                enc2dec=self.enc2dec,
+                decoder=self.decoder,
+                quantile_output=self.quantile_output
+            )
+        else:
+            raise("Not implemented yet!")
+
+    def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
+        if self.use_static_cat is False and self.use_dynamic_real is False:
+            return ForkingSeq2SeqTargetPredictionNetwork(
+                encoder=self.encoder,
+                enc2dec=self.enc2dec,
+                decoder=self.decoder,
+                quantile_output=self.quantile_output
+            )
+        elif self.use_static_cat is False and self.use_dynamic_real:
+            return ForkingSeq2SeqPredictionNetwork(
+                encoder=self.encoder,
+                enc2dec=self.enc2dec,
+                decoder=self.decoder,
+                quantile_output=self.quantile_output
+            )
+        else:
+            raise("Not implemented yet!")
+
+
 
 class ForkingSeq2SeqTrainingNetwork(ForkingSeq2SeqNetworkBase):
     # noinspection PyMethodOverriding
     def hybrid_forward(
-        self, F, past_target: Tensor, past_feat_dynamic_real: Tensor, future_target: Tensor
+        self, F,
+            past_target: Tensor,
+            past_feat_dynamic_real: Tensor,
+            future_target: Tensor
     ) -> Tensor:
         """
         Parameters
@@ -88,17 +152,13 @@ def hybrid_forward(
         # print(f"past_feat_dynamic_real: {past_feat_dynamic_real.shape}")
         # print(f"future_target: {future_target.shape}")
 
-        # FIXME: can we factor out a common prefix in the base network?
-        feat_static_real = nd_None
-        # past_feat_dynamic_real = nd_None
-        future_feat_dynamic_real = nd_None
 
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
+            past_target, self.feat_static_real, past_feat_dynamic_real
         )
 
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
+            enc_output_static, enc_output_dynamic, self.future_feat_dynamic_real
         )
 
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
@@ -112,7 +172,10 @@ def hybrid_forward(
 
 class ForkingSeq2SeqPredictionNetwork(ForkingSeq2SeqNetworkBase):
     # noinspection PyMethodOverriding
-    def hybrid_forward(self, F, past_target: Tensor, past_feat_dynamic_real: Tensor) -> Tensor:
+    def hybrid_forward(self,
+                       F,
+                       past_target: Tensor,
+                       past_feat_dynamic_real: Tensor) -> Tensor:
         """
         Parameters
         ----------
@@ -127,12 +190,9 @@ def hybrid_forward(self, F, past_target: Tensor, past_feat_dynamic_real: Tensor)
         """
 
         # FIXME: can we factor out a common prefix in the base network?
-        feat_static_real = nd_None
-        # past_feat_dynamic_real = nd_None
-        future_feat_dynamic_real = nd_None
 
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
+            past_target, self.feat_static_real, past_feat_dynamic_real
         )
 
         enc_output_static = (
@@ -140,7 +200,7 @@ def hybrid_forward(self, F, past_target: Tensor, past_feat_dynamic_real: Tensor)
         )
 
         dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
+            enc_output_static, enc_output_dynamic, self.future_feat_dynamic_real
         )
 
         dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
@@ -149,3 +209,77 @@ def hybrid_forward(self, F, past_target: Tensor, past_feat_dynamic_real: Tensor)
 
         predictions = self.quantile_proj(fcst_output).swapaxes(2, 1)
         return predictions
+
+
+class ForkingSeq2SeqTargetTrainingNetwork(ForkingSeq2SeqNetworkBase):
+    # noinspection PyMethodOverriding
+    def hybrid_forward(
+        self, F, past_target: Tensor, future_target: Tensor
+    ) -> Tensor:
+        """
+        Parameters
+        ----------
+        F: mx.symbol or mx.ndarray
+            Gluon function space
+        past_target: Tensor
+            FIXME
+        future_target: Tensor
+            shape (num_ts, encoder_length, 1) FIXME
+
+        Returns
+        -------
+        loss with shape (FIXME, FIXME)
+        """
+
+        enc_output_static, enc_output_dynamic = self.encoder(
+            past_target, self.feat_static_real, self.past_feat_dynamic_real
+        )
+
+        dec_input_static, dec_input_dynamic, _ = self.enc2dec(
+            enc_output_static, enc_output_dynamic, self.future_feat_dynamic_real
+        )
+
+        dec_output = self.decoder(dec_input_dynamic, dec_input_static)
+        dec_dist_output = self.quantile_proj(dec_output)
+
+        loss = self.loss(future_target, dec_dist_output)
+        return loss.mean(axis=1)
+
+
+class ForkingSeq2SeqTargetPredictionNetwork(ForkingSeq2SeqNetworkBase):
+    # noinspection PyMethodOverriding
+    def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
+        """
+        Parameters
+        ----------
+        F: mx.symbol or mx.ndarray
+            Gluon function space
+        past_target: Tensor
+            FIXME
+
+        Returns
+        -------
+        prediction tensor with shape (FIXME, FIXME)
+        """
+
+        # FIXME: can we factor out a common prefix in the base network?
+
+
+        enc_output_static, enc_output_dynamic = self.encoder(
+            past_target, self.feat_static_real, self.past_feat_dynamic_real
+        )
+
+        enc_output_static = (
+            nd_None if enc_output_static is None else enc_output_static
+        )
+
+        dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
+            enc_output_static, enc_output_dynamic, self.future_feat_dynamic_real
+        )
+
+        dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
+        fcst_output = F.slice_axis(dec_output, axis=1, begin=-1, end=None)
+        fcst_output = F.squeeze(fcst_output, axis=1)
+
+        predictions = self.quantile_proj(fcst_output).swapaxes(2, 1)
+        return predictions
\ No newline at end of file
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index c89854dca1..7dcac51a20 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -48,6 +48,7 @@ def __init__(
         context_length: Optional[int],
         prediction_length: int,
         freq: str,
+        use_dynamic_feat: bool = False,
         decoder_mlp_dim_seq: List[int] = [20],
         quantiles: List[float] = list(),
         trainer: Trainer = Trainer(),
@@ -73,6 +74,7 @@ def __init__(
             decoder=decoder,
             quantile_output=quantile_output,
             freq=freq,
+            use_dynamic_feat=use_dynamic_feat,
             prediction_length=prediction_length,
             context_length=context_length,
             trainer=trainer,
@@ -91,6 +93,7 @@ def __init__(
         prediction_length: int,
         freq: str,
         context_length: Optional[int] = None,
+        use_dynamic_feat: bool = False,
         seed: Optional[int] = None,
         decoder_mlp_dim_seq: List[int] = [20],
         channels_seq: List[int] = [30, 30, 30],
@@ -119,11 +122,13 @@ def __init__(
             kernel_size_seq=channels_seq,
             channels_seq=kernel_size_seq,
             use_residual=use_residual,
-            use_dynamic_feat=True,
+            use_dynamic_feat=use_dynamic_feat,
             prefix="encoder_",
         )
+
         super(MQCNNEstimator, self).__init__(
             encoder=encoder,
+            use_dynamic_feat=use_dynamic_feat,
             decoder_mlp_dim_seq=decoder_mlp_dim_seq,
             freq=freq,
             prediction_length=prediction_length,
@@ -182,6 +187,7 @@ def __init__(
 
     for _ in range(1):
         estimator = MQCNNEstimator(
+            use_dynamic_feat=True,
             prediction_length=dataset.metadata.prediction_length,
             seed=42,
             freq=dataset.metadata.freq,

From 67d94ebe846cad15de8fdd401cd2227e63ae6493 Mon Sep 17 00:00:00 2001
From: Jasper Schulz <schjaspe@amazon.de>
Date: Tue, 25 Feb 2020 16:49:56 +0100
Subject: [PATCH 05/44] Added checks for dyn features.

---
 src/gluonts/block/encoder.py                  |  6 +-
 .../model/seq2seq/_forking_estimator.py       | 84 ++++++++++---------
 src/gluonts/model/seq2seq/_forking_network.py | 68 ++++++++-------
 3 files changed, 83 insertions(+), 75 deletions(-)

diff --git a/src/gluonts/block/encoder.py b/src/gluonts/block/encoder.py
index 2a3e4ee3a6..9856097601 100644
--- a/src/gluonts/block/encoder.py
+++ b/src/gluonts/block/encoder.py
@@ -233,9 +233,7 @@ def hybrid_forward(
                 dynamic_features=dynamic_features,
             )
         elif self.use_dynamic_feat:
-            inputs = F.concat(
-            target, dynamic_features, dim=2
-        )  # (N, T, C)
+            inputs = F.concat(target, dynamic_features, dim=2)  # (N, T, C)
         else:
             inputs = target
 
@@ -478,4 +476,4 @@ def hybrid_forward(
             F.slice_axis(dynamic_code, axis=1, begin=-1, end=None), axis=1
         )
 
-        return static_code, dynamic_code
\ No newline at end of file
+        return static_code, dynamic_code
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 1db875730c..a858030b10 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -34,6 +34,7 @@
     Chain,
     TestSplitSampler,
     Transformation,
+    VstackFeatures,
 )
 
 # Relative imports
@@ -41,8 +42,11 @@
 from ._forking_network import (
     ForkingSeq2SeqPredictionNetwork,
     ForkingSeq2SeqTrainingNetwork,
-    ForkingSeq2SeqNetwork, ForkingSeq2SeqNetworkBase, ForkingSeq2SeqTargetPredictionNetwork,
-    ForkingSeq2SeqTargetTrainingNetwork)
+    ForkingSeq2SeqNetwork,
+    ForkingSeq2SeqNetworkBase,
+    ForkingSeq2SeqTargetPredictionNetwork,
+    ForkingSeq2SeqTargetTrainingNetwork,
+)
 from ._transform import ForkingSequenceSplitter
 
 
@@ -117,50 +121,52 @@ def __init__(
         self.context_length = (
             context_length if context_length is not None else prediction_length
         )
+        self.add_time_feature = True
+
+    @classmethod
+    def derive_auto_fields(cls, train_iter):
+        return {}
 
     def create_transformation(self) -> Transformation:
+        chain = []
+        dynamic_feat_fields = []
 
-        if self.use_dynamic_feat:
-            feat_def = Chain(
-            trans=[
-                AsNumpyArray(
-                    field=FieldName.TARGET, expected_ndim=1, dtype=self.dtype
-                ),
-                # AddAgeFeature(
-                #     target_field=FieldName.TARGET,
-                #     output_field=FieldName.FEAT_DYNAMIC_REAL,
-                #     log_scale=True,
-                #     pred_length=self.prediction_length,
-                #     dtype=self.dtype,
-                # ),
+        if self.add_time_feature:
+            chain.append(
                 AddTimeFeatures(
                     start_field=FieldName.START,
                     target_field=FieldName.TARGET,
-                    output_field=FieldName.FEAT_DYNAMIC_REAL,
-                    time_features= time_features_from_frequency_str(self.freq),
+                    output_field="time_feature",
+                    time_features=time_features_from_frequency_str(self.freq),
                     pred_length=self.prediction_length,
                 ),
-                ForkingSequenceSplitter(
-                    train_sampler=TestSplitSampler(),
-                    enc_len=self.context_length,
-                    dec_len=self.prediction_length,
-                    encoder_series_fields=[FieldName.FEAT_DYNAMIC_REAL],
-                ),
-            ])
+            )
+            dynamic_feat_fields.append("time_feature")
+
+        if self.use_dynamic_feat:
+            dynamic_feat_fields.append(FieldName.FEAT_DYNAMIC_REAL)
+
+        if dynamic_feat_fields:
+            chain.append(
+                VstackFeatures(
+                    output_field=FieldName.FEAT_TIME,
+                    input_fields=dynamic_feat_fields,
+                )
+            )
+            output_field = [FieldName.FEAT_TIME]
         else:
-            feat_def = Chain(
-            trans=[
-                AsNumpyArray(
-                    field=FieldName.TARGET, expected_ndim=1, dtype=self.dtype
-                ),
-                ForkingSequenceSplitter(
-                    train_sampler=TestSplitSampler(),
-                    enc_len=self.context_length,
-                    dec_len=self.prediction_length,
-                ),
-            ])
+            output_field = []
+
+        chain.append(
+            ForkingSequenceSplitter(
+                train_sampler=TestSplitSampler(),
+                enc_len=self.context_length,
+                dec_len=self.prediction_length,
+                encoder_series_fields=output_field,
+            ),
+        )
 
-        return feat_def
+        return Chain(chain)
 
     def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
         # return ForkingSeq2SeqTrainingNetwork(
@@ -175,10 +181,9 @@ def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
             enc2dec=PassThroughEnc2Dec(),
             decoder=self.decoder,
             quantile_output=self.quantile_output,
-            use_dynamic_real=self.use_dynamic_feat
+            use_dynamic_real=self.use_dynamic_feat,
         ).get_training_network()
 
-
     def create_predictor(
         self,
         transformation: Transformation,
@@ -190,13 +195,12 @@ def create_predictor(
             for quantile in self.quantile_output.quantiles
         ]
 
-
         prediction_network = ForkingSeq2SeqNetwork(
             encoder=trained_network.encoder,
             enc2dec=trained_network.enc2dec,
             decoder=trained_network.decoder,
             quantile_output=trained_network.quantile_output,
-            use_dynamic_real=self.use_dynamic_feat
+            use_dynamic_real=self.use_dynamic_feat,
         ).get_prediction_network()
 
         copy_parameters(trained_network, prediction_network)
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index c805f04030..e2b5bfee29 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -67,17 +67,18 @@ def __init__(
             self.quantile_proj = quantile_output.get_quantile_proj()
             self.loss = quantile_output.get_loss()
 
-class ForkingSeq2SeqNetwork():
+
+class ForkingSeq2SeqNetwork:
     @validated()
     def __init__(
-            self,
-            encoder: Seq2SeqEncoder,
-            enc2dec: Seq2SeqEnc2Dec,
-            decoder: Seq2SeqDecoder,
-            quantile_output: QuantileOutput,
-            use_dynamic_real: bool = False,
-            use_static_cat: bool = False,
-            **kwargs,
+        self,
+        encoder: Seq2SeqEncoder,
+        enc2dec: Seq2SeqEnc2Dec,
+        decoder: Seq2SeqDecoder,
+        quantile_output: QuantileOutput,
+        use_dynamic_real: bool = False,
+        use_static_cat: bool = False,
+        **kwargs,
     ) -> None:
         self.encoder = encoder
         self.enc2dec = enc2dec
@@ -93,17 +94,17 @@ def get_training_network(self) -> ForkingSeq2SeqNetworkBase:
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
                 decoder=self.decoder,
-                quantile_output=self.quantile_output
+                quantile_output=self.quantile_output,
             )
         elif self.use_static_cat is False and self.use_dynamic_real:
             return ForkingSeq2SeqTrainingNetwork(
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
                 decoder=self.decoder,
-                quantile_output=self.quantile_output
+                quantile_output=self.quantile_output,
             )
         else:
-            raise("Not implemented yet!")
+            raise ("Not implemented yet!")
 
     def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
         if self.use_static_cat is False and self.use_dynamic_real is False:
@@ -111,27 +112,27 @@ def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
                 decoder=self.decoder,
-                quantile_output=self.quantile_output
+                quantile_output=self.quantile_output,
             )
         elif self.use_static_cat is False and self.use_dynamic_real:
             return ForkingSeq2SeqPredictionNetwork(
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
                 decoder=self.decoder,
-                quantile_output=self.quantile_output
+                quantile_output=self.quantile_output,
             )
         else:
-            raise("Not implemented yet!")
-
+            raise ("Not implemented yet!")
 
 
 class ForkingSeq2SeqTrainingNetwork(ForkingSeq2SeqNetworkBase):
     # noinspection PyMethodOverriding
     def hybrid_forward(
-        self, F,
-            past_target: Tensor,
-            past_feat_dynamic_real: Tensor,
-            future_target: Tensor
+        self,
+        F,
+        past_target: Tensor,
+        past_feat_dynamic_real: Tensor,
+        future_target: Tensor,
     ) -> Tensor:
         """
         Parameters
@@ -152,13 +153,14 @@ def hybrid_forward(
         # print(f"past_feat_dynamic_real: {past_feat_dynamic_real.shape}")
         # print(f"future_target: {future_target.shape}")
 
-
         enc_output_static, enc_output_dynamic = self.encoder(
             past_target, self.feat_static_real, past_feat_dynamic_real
         )
 
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, self.future_feat_dynamic_real
+            enc_output_static,
+            enc_output_dynamic,
+            self.future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
@@ -172,10 +174,9 @@ def hybrid_forward(
 
 class ForkingSeq2SeqPredictionNetwork(ForkingSeq2SeqNetworkBase):
     # noinspection PyMethodOverriding
-    def hybrid_forward(self,
-                       F,
-                       past_target: Tensor,
-                       past_feat_dynamic_real: Tensor) -> Tensor:
+    def hybrid_forward(
+        self, F, past_target: Tensor, past_feat_dynamic_real: Tensor
+    ) -> Tensor:
         """
         Parameters
         ----------
@@ -200,7 +201,9 @@ def hybrid_forward(self,
         )
 
         dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, self.future_feat_dynamic_real
+            enc_output_static,
+            enc_output_dynamic,
+            self.future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
@@ -236,7 +239,9 @@ def hybrid_forward(
         )
 
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, self.future_feat_dynamic_real
+            enc_output_static,
+            enc_output_dynamic,
+            self.future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
@@ -264,7 +269,6 @@ def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
 
         # FIXME: can we factor out a common prefix in the base network?
 
-
         enc_output_static, enc_output_dynamic = self.encoder(
             past_target, self.feat_static_real, self.past_feat_dynamic_real
         )
@@ -274,7 +278,9 @@ def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
         )
 
         dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, self.future_feat_dynamic_real
+            enc_output_static,
+            enc_output_dynamic,
+            self.future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
@@ -282,4 +288,4 @@ def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
         fcst_output = F.squeeze(fcst_output, axis=1)
 
         predictions = self.quantile_proj(fcst_output).swapaxes(2, 1)
-        return predictions
\ No newline at end of file
+        return predictions

From 2e8dc066306ff0dde554487f6a40cf92a255ab7d Mon Sep 17 00:00:00 2001
From: Jasper Schulz <schjaspe@amazon.de>
Date: Tue, 25 Feb 2020 17:18:26 +0100
Subject: [PATCH 06/44] Fix from_hyperparameters for GluonEstimator.

---
 src/gluonts/model/estimator.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gluonts/model/estimator.py b/src/gluonts/model/estimator.py
index e18d801763..bf38b0b19e 100644
--- a/src/gluonts/model/estimator.py
+++ b/src/gluonts/model/estimator.py
@@ -135,7 +135,10 @@ def from_hyperparameters(cls, **hyperparameters) -> "GluonEstimator":
             )
 
         try:
-            trainer = from_hyperparameters(Trainer, **hyperparameters)
+            trainer = hyperparameters.get("trainer")
+            if not isinstance(trainer, Trainer):
+                trainer = from_hyperparameters(Trainer, **hyperparameters)
+
             return cls(
                 **Model(**{**hyperparameters, "trainer": trainer}).__dict__
             )

From 70f3a255c4f93ca204c86fdd7c38b578a4ddc51e Mon Sep 17 00:00:00 2001
From: Bernie Wang <yuyawang@amazon.com>
Date: Tue, 25 Feb 2020 18:14:52 +0100
Subject: [PATCH 07/44] enable date and age features, and rts

---
 src/gluonts/model/estimator.py                |  2 +-
 .../model/seq2seq/_forking_estimator.py       | 51 ++++++++++++++-----
 src/gluonts/model/seq2seq/_forking_network.py |  4 +-
 .../model/seq2seq/_mq_dnn_estimator.py        | 40 ++++++++++++---
 src/gluonts/model/seq2seq/_transform.py       |  4 +-
 src/gluonts/shell/train.py                    |  2 +-
 6 files changed, 77 insertions(+), 26 deletions(-)

diff --git a/src/gluonts/model/estimator.py b/src/gluonts/model/estimator.py
index bf38b0b19e..3a5784fce6 100644
--- a/src/gluonts/model/estimator.py
+++ b/src/gluonts/model/estimator.py
@@ -74,7 +74,7 @@ def derive_auto_fields(cls, train_iter):
         return {}
 
     @classmethod
-    def from_inputs(cls, train_iter, params):
+    def from_inputs(cls, train_iter, **params):
         auto_params = cls.derive_auto_fields(train_iter)
         return cls.from_hyperparameters(**auto_params, **params)
 
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index a858030b10..77e49e0f47 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -35,6 +35,7 @@
     TestSplitSampler,
     Transformation,
     VstackFeatures,
+    RenameFields,
 )
 
 # Relative imports
@@ -100,6 +101,8 @@ def __init__(
         freq: str,
         prediction_length: int,
         use_dynamic_feat: bool = False,
+        add_time_feature: bool = False,
+        add_age_feature: bool = False,
         context_length: Optional[int] = None,
         trainer: Trainer = Trainer(),
     ) -> None:
@@ -121,11 +124,14 @@ def __init__(
         self.context_length = (
             context_length if context_length is not None else prediction_length
         )
-        self.add_time_feature = True
+        self.add_time_feature = add_time_feature
+        self.add_age_feature = add_age_feature
 
-    @classmethod
-    def derive_auto_fields(cls, train_iter):
-        return {}
+        # is target only network or not?
+        self.dynamic_network = (
+            use_dynamic_feat or add_time_feature or add_age_feature
+        )
+        print(f"use_dynamic_network: {self.dynamic_network}")
 
     def create_transformation(self) -> Transformation:
         chain = []
@@ -136,33 +142,50 @@ def create_transformation(self) -> Transformation:
                 AddTimeFeatures(
                     start_field=FieldName.START,
                     target_field=FieldName.TARGET,
-                    output_field="time_feature",
+                    output_field=FieldName.FEAT_TIME,
                     time_features=time_features_from_frequency_str(self.freq),
                     pred_length=self.prediction_length,
                 ),
             )
-            dynamic_feat_fields.append("time_feature")
+            dynamic_feat_fields.append(FieldName.FEAT_TIME)
+
+        if self.add_age_feature:
+            chain.append(
+                AddAgeFeature(
+                    target_field=FieldName.TARGET,
+                    output_field=FieldName.FEAT_AGE,
+                    pred_length=self.prediction_length,
+                ),
+            )
+            dynamic_feat_fields.append(FieldName.FEAT_AGE)
 
         if self.use_dynamic_feat:
             dynamic_feat_fields.append(FieldName.FEAT_DYNAMIC_REAL)
 
-        if dynamic_feat_fields:
+        if len(dynamic_feat_fields) > 1:
             chain.append(
                 VstackFeatures(
-                    output_field=FieldName.FEAT_TIME,
+                    output_field=FieldName.FEAT_DYNAMIC_REAL,
                     input_fields=dynamic_feat_fields,
                 )
             )
-            output_field = [FieldName.FEAT_TIME]
-        else:
-            output_field = []
+        elif len(dynamic_feat_fields) == 1:
+            chain.append(
+                RenameFields(
+                    {dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC_REAL}
+                )
+            )
+
+        decoder_field = (
+            [FieldName.FEAT_DYNAMIC_REAL] if dynamic_feat_fields else []
+        )
 
         chain.append(
             ForkingSequenceSplitter(
                 train_sampler=TestSplitSampler(),
                 enc_len=self.context_length,
                 dec_len=self.prediction_length,
-                encoder_series_fields=output_field,
+                encoder_series_fields=decoder_field,
             ),
         )
 
@@ -181,7 +204,7 @@ def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
             enc2dec=PassThroughEnc2Dec(),
             decoder=self.decoder,
             quantile_output=self.quantile_output,
-            use_dynamic_real=self.use_dynamic_feat,
+            use_dynamic_real=self.dynamic_network,
         ).get_training_network()
 
     def create_predictor(
@@ -200,7 +223,7 @@ def create_predictor(
             enc2dec=trained_network.enc2dec,
             decoder=trained_network.decoder,
             quantile_output=trained_network.quantile_output,
-            use_dynamic_real=self.use_dynamic_feat,
+            use_dynamic_real=self.dynamic_network,
         ).get_prediction_network()
 
         copy_parameters(trained_network, prediction_network)
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index e2b5bfee29..a8a0d53970 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -104,7 +104,7 @@ def get_training_network(self) -> ForkingSeq2SeqNetworkBase:
                 quantile_output=self.quantile_output,
             )
         else:
-            raise ("Not implemented yet!")
+            raise NotImplementedError
 
     def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
         if self.use_static_cat is False and self.use_dynamic_real is False:
@@ -122,7 +122,7 @@ def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
                 quantile_output=self.quantile_output,
             )
         else:
-            raise ("Not implemented yet!")
+            raise NotImplementedError
 
 
 class ForkingSeq2SeqTrainingNetwork(ForkingSeq2SeqNetworkBase):
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 7dcac51a20..1c905a39cb 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -15,6 +15,7 @@
 from typing import List, Optional
 
 # First-party imports
+from dataset.stat import calculate_dataset_statistics
 from gluonts.evaluation.backtest import make_evaluation_predictions
 from gluonts.block.decoder import ForkingMLPDecoder
 from gluonts.block.encoder import (
@@ -49,6 +50,8 @@ def __init__(
         prediction_length: int,
         freq: str,
         use_dynamic_feat: bool = False,
+        add_time_feature: bool = False,
+        add_age_feature: bool = False,
         decoder_mlp_dim_seq: List[int] = [20],
         quantiles: List[float] = list(),
         trainer: Trainer = Trainer(),
@@ -75,6 +78,8 @@ def __init__(
             quantile_output=quantile_output,
             freq=freq,
             use_dynamic_feat=use_dynamic_feat,
+            add_age_feature=add_age_feature,
+            add_time_feature=add_time_feature,
             prediction_length=prediction_length,
             context_length=context_length,
             trainer=trainer,
@@ -93,7 +98,9 @@ def __init__(
         prediction_length: int,
         freq: str,
         context_length: Optional[int] = None,
-        use_dynamic_feat: bool = False,
+        use_feat_dynamic_real: bool = False,
+        add_time_feature: bool = False,
+        add_age_feature: bool = False,
         seed: Optional[int] = None,
         decoder_mlp_dim_seq: List[int] = [20],
         channels_seq: List[int] = [30, 30, 30],
@@ -106,6 +113,11 @@ def __init__(
         trainer: Trainer = Trainer(),
     ) -> None:
 
+        use_dynamic_feat_cnn = False
+
+        if use_feat_dynamic_real or add_age_feature or add_time_feature:
+            use_dynamic_feat_cnn = True
+
         if seed:
             np.random.seed(seed)
             mx.random.seed(seed)
@@ -122,13 +134,15 @@ def __init__(
             kernel_size_seq=channels_seq,
             channels_seq=kernel_size_seq,
             use_residual=use_residual,
-            use_dynamic_feat=use_dynamic_feat,
+            use_dynamic_feat=use_dynamic_feat_cnn,
             prefix="encoder_",
         )
 
         super(MQCNNEstimator, self).__init__(
             encoder=encoder,
-            use_dynamic_feat=use_dynamic_feat,
+            use_dynamic_feat=use_feat_dynamic_real,
+            add_time_feature=add_time_feature,
+            add_age_feature=add_age_feature,
             decoder_mlp_dim_seq=decoder_mlp_dim_seq,
             freq=freq,
             prediction_length=prediction_length,
@@ -137,6 +151,16 @@ def __init__(
             quantiles=quantiles,
         )
 
+    @classmethod
+    def derive_auto_fields(cls, train_iter):
+        stats = calculate_dataset_statistics(train_iter)
+
+        return {
+            "use_feat_dynamic_real": stats.num_feat_dynamic_real > 0,
+            # "use_feat_static_cat": bool(stats.feat_static_cat),
+            # "cardinality": [len(cats) for cats in stats.feat_static_cat],
+        }
+
 
 class MQRNNEstimator(MQDNNEstimator):
     """
@@ -186,8 +210,10 @@ def __init__(
     metrics = []
 
     for _ in range(1):
-        estimator = MQCNNEstimator(
-            use_dynamic_feat=True,
+        estimator = MQCNNEstimator.from_inputs(
+            dataset.train,
+            # add_time_feature=True,
+            # add_age_feature=True,
             prediction_length=dataset.metadata.prediction_length,
             seed=42,
             freq=dataset.metadata.freq,
@@ -199,12 +225,14 @@ def __init__(
 
         predictor = estimator.train(dataset.train)
 
+        assert dataset.test is not None
+
         forecast_it, ts_it = make_evaluation_predictions(
             dataset.test, predictor=predictor, num_samples=100
         )
 
         agg_metrics, item_metrics = Evaluator()(
-            ts_it, forecast_it, num_series=len(dataset.test)
+            ts_it, forecast_it, num_series=len(dataset.test)  # type: ignore
         )
 
         metrics.append(agg_metrics["wQuantileLoss[0.5]"])
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index dc0ff84664..72df2a0d8e 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -13,7 +13,7 @@
 
 # Standard library imports
 from collections import Counter
-from typing import Iterator, List
+from typing import Iterator, List, Any
 
 # Third-party imports
 import numpy as np
@@ -126,7 +126,7 @@ def flatmap_transform(
                     continue
 
                 if ts_field in decoder_fields:
-                    d3 = () if ts_field == self.target_in else (len(ts),)
+                    d3: Any = () if ts_field == self.target_in else (len(ts),)
                     forking_dec_field = np.zeros(
                         shape=(self.enc_len, self.dec_len) + d3
                     )
diff --git a/src/gluonts/shell/train.py b/src/gluonts/shell/train.py
index e561c15ba8..96e090681e 100644
--- a/src/gluonts/shell/train.py
+++ b/src/gluonts/shell/train.py
@@ -63,7 +63,7 @@ def run_train_and_test(
     logger.info(f"Using forecaster {forecaster_fq_name} v{forecaster_version}")
 
     forecaster = forecaster_type.from_inputs(
-        env.datasets["train"], env.hyperparameters
+        env.datasets["train"], **env.hyperparameters
     )
 
     logger.info(

From 1f66ac1715881f8e225cf7a4e5684b8f3da251f1 Mon Sep 17 00:00:00 2001
From: Jasper Schulz <schjaspe@amazon.de>
Date: Wed, 26 Feb 2020 11:53:56 +0100
Subject: [PATCH 08/44] Fixup.

---
 src/gluonts/model/seq2seq/_mq_dnn_estimator.py | 2 +-
 test/model/seq2seq/test_encoders.py            | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 1c905a39cb..362b89a073 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -15,7 +15,7 @@
 from typing import List, Optional
 
 # First-party imports
-from dataset.stat import calculate_dataset_statistics
+from gluonts.dataset.stat import calculate_dataset_statistics
 from gluonts.evaluation.backtest import make_evaluation_predictions
 from gluonts.block.decoder import ForkingMLPDecoder
 from gluonts.block.encoder import (
diff --git a/test/model/seq2seq/test_encoders.py b/test/model/seq2seq/test_encoders.py
index 96063864de..2fa9f35e44 100644
--- a/test/model/seq2seq/test_encoders.py
+++ b/test/model/seq2seq/test_encoders.py
@@ -38,7 +38,7 @@ def test_hierarchical_cnn_encoders(use_residual, hybridize) -> None:
     dial_seq = [1, 3, 9]
 
     cnn = HierarchicalCausalConv1DEncoder(
-        dial_seq, ks_seq, chl_dim, use_residual, use_dynamic_feat=True
+        dial_seq, ks_seq, chl_dim, use_residual, use_dynamic_feat=True, use_static_feat=True,
     )
     cnn.collect_params().initialize()
 
@@ -47,4 +47,4 @@ def test_hierarchical_cnn_encoders(use_residual, hybridize) -> None:
 
     true_shape = (num_ts, ts_len, 31) if use_residual else (num_ts, ts_len, 30)
 
-    assert cnn(test_data, test_static_feat, test_dynamic_feat)[1].shape == true_shape
\ No newline at end of file
+    assert cnn(test_data, test_static_feat, test_dynamic_feat)[1].shape == true_shape

From de08785234e0918e7cbcf7fc32a41a77209e453b Mon Sep 17 00:00:00 2001
From: Jasper Schulz <schjaspe@amazon.de>
Date: Tue, 31 Mar 2020 10:41:37 +0200
Subject: [PATCH 09/44] xx

---
 src/gluonts/model/seq2seq/_forking_network.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index f4f962eb7a..0d55a85ce8 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -58,9 +58,9 @@ def __init__(
         self.decoder = decoder
         self.quantile_output = quantile_output
 
-        self.feat_static_real = nd_None
-        self.past_feat_dynamic_real = nd_None
-        self.future_feat_dynamic_real = nd_None
+        # self.feat_static_real = F.zeros(shape=(1,))
+        # self.past_feat_dynamic_real = F.zeros(shape=(1,))
+        # self.future_feat_dynamic_real = F.zeros(shape=(1,))
 
         with self.name_scope():
             self.quantile_proj = quantile_output.get_quantile_proj()

From 2a6d737fc3eec57ace426c3e5a8508c9986cef93 Mon Sep 17 00:00:00 2001
From: Jasper Schulz <schjaspe@amazon.de>
Date: Tue, 31 Mar 2020 17:58:22 +0200
Subject: [PATCH 10/44] Fixup.

---
 src/gluonts/block/encoder.py                  | 11 ++----
 src/gluonts/model/seq2seq/_forking_network.py | 34 +++++++++----------
 src/gluonts/model/seq2seq/_transform.py       |  2 +-
 3 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/src/gluonts/block/encoder.py b/src/gluonts/block/encoder.py
index 9856097601..f69cc696f2 100644
--- a/src/gluonts/block/encoder.py
+++ b/src/gluonts/block/encoder.py
@@ -32,10 +32,6 @@ class Seq2SeqEncoder(nn.HybridBlock):
     a dynamic latent code with the same length as the `target` sequence.
     """
 
-    @validated()
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
     # noinspection PyMethodOverriding
     def hybrid_forward(
         self,
@@ -77,7 +73,6 @@ def hybrid_forward(
         """
         raise NotImplementedError
 
-    @staticmethod
     def _assemble_inputs(
         F, target: Tensor, static_features: Tensor, dynamic_features: Tensor
     ) -> Tensor:
@@ -226,7 +221,7 @@ def hybrid_forward(
         """
 
         if self.use_dynamic_feat and self.use_static_feat:
-            inputs = Seq2SeqEncoder._assemble_inputs(
+            inputs = self._assemble_inputs(
                 F,
                 target=target,
                 static_features=static_features,
@@ -383,7 +378,7 @@ def hybrid_forward(
             shape (batch_size, sequence_length, num_dynamic_features)
         """
 
-        inputs = Seq2SeqEncoder._assemble_inputs(
+        inputs = self._assemble_inputs(
             F, target, static_features, dynamic_features
         )
         static_code = self.model(inputs)
@@ -465,7 +460,7 @@ def hybrid_forward(
             dynamic code,
             shape (batch_size, sequence_length, num_dynamic_features)
         """
-        inputs = Seq2SeqEncoder._assemble_inputs(
+        inputs = self._assemble_inputs(
             F, target, static_features, dynamic_features
         )
         dynamic_code = self.rnn(inputs)
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 0d55a85ce8..b497a1d482 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -106,7 +106,7 @@ def get_training_network(self) -> ForkingSeq2SeqNetworkBase:
             raise NotImplementedError
 
     def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
-        if self.use_static_cat is False and self.use_dynamic_real is False:
+        if not self.use_static_cat and not self.use_dynamic_real:
             return ForkingSeq2SeqTargetPredictionNetwork(
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
@@ -153,15 +153,12 @@ def hybrid_forward(
         past_feat_dynamic_real = F.zeros(shape=(1,))
         future_feat_dynamic_real = F.zeros(shape=(1,))
 
-
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, self.feat_static_real, past_feat_dynamic_real
+            past_target, feat_static_real, past_feat_dynamic_real
         )
 
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static,
-            enc_output_dynamic,
-            self.future_feat_dynamic_real,
+            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
         )
 
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
@@ -193,8 +190,11 @@ def hybrid_forward(
 
         # FIXME: can we factor out a common prefix in the base network?
 
+        feat_static_real = F.zeros(shape=(1,))
+        future_feat_dynamic_real = F.zeros(shape=(1,))
+
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, self.feat_static_real, past_feat_dynamic_real
+            past_target, feat_static_real, past_feat_dynamic_real
         )
 
         enc_output_static = (
@@ -202,9 +202,7 @@ def hybrid_forward(
         )
 
         dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static,
-            enc_output_dynamic,
-            self.future_feat_dynamic_real,
+            enc_output_static, enc_output_dynamic, future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
@@ -235,14 +233,16 @@ def hybrid_forward(
         loss with shape (FIXME, FIXME)
         """
 
+        feat_static_real = F.zeros(shape=(1,))
+        past_feat_dynamic_real = F.zeros(shape=(1,))
+        future_feat_dynamic_real = F.zeros(shape=(1,))
+
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, self.feat_static_real, self.past_feat_dynamic_real
+            past_target, feat_static_real, past_feat_dynamic_real
         )
 
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static,
-            enc_output_dynamic,
-            self.future_feat_dynamic_real,
+            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
         )
 
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
@@ -274,7 +274,7 @@ def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
         future_feat_dynamic_real = F.zeros(shape=(1,))
 
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, self.feat_static_real, self.past_feat_dynamic_real
+            past_target, feat_static_real, past_feat_dynamic_real
         )
 
         enc_output_static = (
@@ -284,9 +284,7 @@ def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
         )
 
         dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static,
-            enc_output_dynamic,
-            self.future_feat_dynamic_real,
+            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
         )
 
         dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index 72df2a0d8e..d8f3564a63 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -130,8 +130,8 @@ def flatmap_transform(
                     forking_dec_field = np.zeros(
                         shape=(self.enc_len, self.dec_len) + d3
                     )
+                    skip = max(0, self.enc_len - sampling_idx)
 
-                    skip = max(0, self.enc_len - 1 - sampling_idx)
                     for dec_field, idx in zip(
                         forking_dec_field[skip:],
                         range(start_idx + 1, start_idx + self.enc_len + 1),

From 592eb673bb76a4362f2a5b450e5fb4765050d683 Mon Sep 17 00:00:00 2001
From: Jasper Schulz <schjaspe@amazon.de>
Date: Wed, 1 Apr 2020 17:38:02 +0200
Subject: [PATCH 11/44] Another fixup.

---
 src/gluonts/block/encoder.py                         | 6 +++++-
 test/model/seq2seq/test_forking_sequence_splitter.py | 4 ++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/gluonts/block/encoder.py b/src/gluonts/block/encoder.py
index f69cc696f2..77d7ea4f68 100644
--- a/src/gluonts/block/encoder.py
+++ b/src/gluonts/block/encoder.py
@@ -74,7 +74,11 @@ def hybrid_forward(
         raise NotImplementedError
 
     def _assemble_inputs(
-        F, target: Tensor, static_features: Tensor, dynamic_features: Tensor
+        self,
+        F,
+        target: Tensor,
+        static_features: Tensor,
+        dynamic_features: Tensor,
     ) -> Tensor:
         """
         Assemble features from target, static features, and the dynamic
diff --git a/test/model/seq2seq/test_forking_sequence_splitter.py b/test/model/seq2seq/test_forking_sequence_splitter.py
index 2087f86551..42d2881bc6 100644
--- a/test/model/seq2seq/test_forking_sequence_splitter.py
+++ b/test/model/seq2seq/test_forking_sequence_splitter.py
@@ -46,7 +46,7 @@ def test_forking_sequence_splitter() -> None:
     dec_len = 3
 
     trans = transform.Chain(
-        trans=[
+        [
             transform.AddAgeFeature(
                 target_field=FieldName.TARGET,
                 output_field="age",
@@ -61,7 +61,7 @@ def test_forking_sequence_splitter() -> None:
         ]
     )
 
-    out = trans(iter(ds), is_train=True)
+    out = trans(ds, is_train=True)
     transformed_data = next(iter(out))
 
     future_target = np.array(

From db57304e34d532492741fa2abf116bbba7bf58fd Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Tue, 14 Apr 2020 14:23:45 +0200
Subject: [PATCH 12/44] Fixing formatting and tests.

---
 src/gluonts/model/estimator.py                |  2 +-
 src/gluonts/model/predictor.py                |  2 +-
 src/gluonts/model/seq2seq/_forking_network.py |  5 +++++
 test/model/seq2seq/test_encoders.py           | 14 ++++++++++----
 test/model/seq2seq/test_model.py              |  4 ++--
 5 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/src/gluonts/model/estimator.py b/src/gluonts/model/estimator.py
index f3c95aa17b..b48dfa0e3b 100644
--- a/src/gluonts/model/estimator.py
+++ b/src/gluonts/model/estimator.py
@@ -12,7 +12,7 @@
 # permissions and limitations under the License.
 
 # Standard library imports
-from typing import NamedTuple, Optional
+from typing import NamedTuple, Optional, Iterator
 
 # Third-party imports
 import numpy as np
diff --git a/src/gluonts/model/predictor.py b/src/gluonts/model/predictor.py
index edd62ea6c3..a3031107de 100644
--- a/src/gluonts/model/predictor.py
+++ b/src/gluonts/model/predictor.py
@@ -161,7 +161,7 @@ def derive_auto_fields(cls, train_iter):
         return {}
 
     @classmethod
-    def from_inputs(cls, train_iter, params):
+    def from_inputs(cls, train_iter, **params):
         auto_params = cls.derive_auto_fields(train_iter)
         return cls.from_hyperparameters(**auto_params, **params)
 
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index b497a1d482..d8020e8d05 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -14,6 +14,8 @@
 # Third-party imports
 import mxnet as mx
 from mxnet import gluon
+from mxnet import nd
+
 
 # First-party imports
 from gluonts.block.decoder import Seq2SeqDecoder
@@ -24,6 +26,9 @@
 from gluonts.model.common import Tensor
 
 
+nd_None = nd.array([])
+
+
 class ForkingSeq2SeqNetworkBase(gluon.HybridBlock):
     """
     Base network for the :class:`ForkingSeq2SeqEstimator`.
diff --git a/test/model/seq2seq/test_encoders.py b/test/model/seq2seq/test_encoders.py
index 2fa9f35e44..7ac949d6df 100644
--- a/test/model/seq2seq/test_encoders.py
+++ b/test/model/seq2seq/test_encoders.py
@@ -18,8 +18,6 @@
 # First-party imports
 from gluonts.block.encoder import HierarchicalCausalConv1DEncoder
 
-nd_None = nd.array([])
-
 
 @pytest.mark.parametrize("use_residual", [True, False])
 @pytest.mark.parametrize("hybridize", [True, False])
@@ -38,7 +36,12 @@ def test_hierarchical_cnn_encoders(use_residual, hybridize) -> None:
     dial_seq = [1, 3, 9]
 
     cnn = HierarchicalCausalConv1DEncoder(
-        dial_seq, ks_seq, chl_dim, use_residual, use_dynamic_feat=True, use_static_feat=True,
+        dial_seq,
+        ks_seq,
+        chl_dim,
+        use_residual,
+        use_dynamic_feat=True,
+        use_static_feat=True,
     )
     cnn.collect_params().initialize()
 
@@ -47,4 +50,7 @@ def test_hierarchical_cnn_encoders(use_residual, hybridize) -> None:
 
     true_shape = (num_ts, ts_len, 31) if use_residual else (num_ts, ts_len, 30)
 
-    assert cnn(test_data, test_static_feat, test_dynamic_feat)[1].shape == true_shape
+    assert (
+        cnn(test_data, test_static_feat, test_dynamic_feat)[1].shape
+        == true_shape
+    )
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index 6c8487e5e1..ca4cc135f4 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -43,9 +43,9 @@ def Estimator(request):
 
 @pytest.mark.parametrize("hybridize", [True, False])
 def test_accuracy(Estimator, accuracy_test, hyperparameters, hybridize):
-    hyperparameters.update(num_batches_per_epoch=200, hybridize=hybridize)
+    hyperparameters.update(num_batches_per_epoch=100, hybridize=hybridize)
 
-    accuracy_test(Estimator, hyperparameters, accuracy=0.2)
+    accuracy_test(Estimator, hyperparameters, accuracy=0.25)
 
 
 def test_repr(Estimator, repr_test, hyperparameters):

From ecf31e42df5b7fdce05816a65c78ef6a7376f7ff Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Wed, 15 Apr 2020 14:01:06 +0200
Subject: [PATCH 13/44] A lot of TODOs and comments added.

---
 src/gluonts/block/decoder.py                  |  2 ++
 src/gluonts/model/estimator.py                |  6 ++++
 .../model/seq2seq/_forking_estimator.py       |  7 +++-
 src/gluonts/model/seq2seq/_forking_network.py | 13 +++++--
 .../model/seq2seq/_mq_dnn_estimator.py        | 35 ++++++++++++++-----
 .../model/seq2seq/_seq2seq_estimator.py       |  4 +++
 6 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/src/gluonts/block/decoder.py b/src/gluonts/block/decoder.py
index 9b0e26e28f..69eeeaaff0 100644
--- a/src/gluonts/block/decoder.py
+++ b/src/gluonts/block/decoder.py
@@ -52,6 +52,7 @@ def hybrid_forward(
         pass
 
 
+# TODO: add support for static variables
 class ForkingMLPDecoder(Seq2SeqDecoder):
     """
     Multilayer perceptron decoder for sequence-to-sequence models.
@@ -104,6 +105,7 @@ def __init__(
             )
             self.model.add(layer)
 
+    # TODO: add support for static input
     def hybrid_forward(
         self, F, dynamic_input: Tensor, static_input: Tensor = None
     ) -> Tensor:
diff --git a/src/gluonts/model/estimator.py b/src/gluonts/model/estimator.py
index b48dfa0e3b..0668d19ce6 100644
--- a/src/gluonts/model/estimator.py
+++ b/src/gluonts/model/estimator.py
@@ -75,7 +75,13 @@ def derive_auto_fields(cls, train_iter):
 
     @classmethod
     def from_inputs(cls, train_iter, **params):
+        # auto_params usually include `use_feat_dynamic_real`, `use_feat_static_cat` and `cardinality`
         auto_params = cls.derive_auto_fields(train_iter)
+        # FIXME: probably params should take precedence over auto_params, since they were deliberately set,
+        #   however, on that case this method does not make sense, since if params says `use_feat_dynamic_real`=True
+        #   but `auto_params`=False, then this will lead to an error, since the appropriate data does not exist.
+        #   This the only context in which this method makes sense is when auto_params take precedence, which could
+        #   lead to overwriting of explicit parameters. In this case a warning should be issued.
         return cls.from_hyperparameters(**auto_params, **params)
 
 
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 77e49e0f47..eee591bdbb 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -127,6 +127,9 @@ def __init__(
         self.add_time_feature = add_time_feature
         self.add_age_feature = add_age_feature
 
+        # TODO: refactor this variable name: dynamic_network, in fact it
+        #  is not even necessary as is, because this is how use_dynamic_feat was
+        #  set in MQCNNEstimator and otherwise its not used, i.e. False
         # is target only network or not?
         self.dynamic_network = (
             use_dynamic_feat or add_time_feature or add_age_feature
@@ -172,7 +175,9 @@ def create_transformation(self) -> Transformation:
         elif len(dynamic_feat_fields) == 1:
             chain.append(
                 RenameFields(
-                    {dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC_REAL}
+                    {
+                        dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC_REAL
+                    }  # TODO: find out why this is done
                 )
             )
 
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index d8020e8d05..fc0af163f4 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -72,6 +72,8 @@ def __init__(
             self.loss = quantile_output.get_loss()
 
 
+# TODO: THIS SHOULD NOT EXIST, the if else logic should be handled in
+#  the _forking_estimator.py, and possible assertions too
 class ForkingSeq2SeqNetwork:
     @validated()
     def __init__(
@@ -129,6 +131,8 @@ def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
             raise NotImplementedError
 
 
+# TODO: figure out whether we need 2 classes each, in fact we would need 4 each,
+#  if adding categorical with this technique, does not seem reasonable
 class ForkingSeq2SeqTrainingNetwork(ForkingSeq2SeqNetworkBase):
     # noinspection PyMethodOverriding
     def hybrid_forward(
@@ -155,13 +159,18 @@ def hybrid_forward(
 
         # FIXME: can we factor out a common prefix in the base network?
         feat_static_real = F.zeros(shape=(1,))
-        past_feat_dynamic_real = F.zeros(shape=(1,))
+        # TODO: THIS IS OVERWRITING THE ARGUMENT?!?! (REMOVING IT makes add time and age feature work):
+        # past_feat_dynamic_real = F.zeros(shape=(1,))
         future_feat_dynamic_real = F.zeros(shape=(1,))
 
+        # arguments: target, static_features, dynamic_features
         enc_output_static, enc_output_dynamic = self.encoder(
             past_target, feat_static_real, past_feat_dynamic_real
         )
 
+        # arguments: encoder_output_static, encoder_output_dynamic, future_features
+        # TODO: figure out how future_features is supposed to be used: since no distinction
+        #  between dynamic and static anymore (shape is (N, T, C) suggesting dynamic feature)
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
             enc_output_static, enc_output_dynamic, future_feat_dynamic_real
         )
@@ -169,8 +178,6 @@ def hybrid_forward(
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
         dec_dist_output = self.quantile_proj(dec_output)
 
-        # print(f"decoder output: {dec_dist_output.shape}")
-
         loss = self.loss(future_target, dec_dist_output)
         return loss.mean(axis=1)
 
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 362b89a073..62eb9cb28d 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -12,7 +12,7 @@
 # permissions and limitations under the License.
 
 # Standard library imports
-from typing import List, Optional
+from typing import List, Optional, Sized
 
 # First-party imports
 from gluonts.dataset.stat import calculate_dataset_statistics
@@ -33,7 +33,13 @@
 import numpy as np
 import mxnet as mx
 
+# TODO: in general, it seems unnecessary to put the MQCNN and MQRNN into Seq2Seq since their commonality in code with
+#  the rest is just the abstract classes Seq2SeqDecoder and Seq2SeqEncoder,
+#  and the Estimator is not based on Seq2SeqEstimator!
 
+
+# TODO: THIS CLASS SHOULD NOT EXIST, the decoder
+#  can be defined in each current subclass
 class MQDNNEstimator(ForkingSeq2SeqEstimator):
     """
     Intermediate base class for a Multi-horizon Quantile Deep Neural Network
@@ -72,7 +78,7 @@ def __init__(
 
         quantile_output = QuantileOutput(quantiles)
 
-        super(MQDNNEstimator, self).__init__(
+        super().__init__(
             encoder=encoder,
             decoder=decoder,
             quantile_output=quantile_output,
@@ -86,6 +92,7 @@ def __init__(
         )
 
 
+# TODO: integrate MQDNN, change arguments to non mutable
 class MQCNNEstimator(MQDNNEstimator):
     """
     An :class:`MQDNNEstimator` with a Convolutional Neural Network (CNN) as an
@@ -99,8 +106,12 @@ def __init__(
         freq: str,
         context_length: Optional[int] = None,
         use_feat_dynamic_real: bool = False,
-        add_time_feature: bool = False,
+        use_feat_static_cat: bool = False,
+        cardinality: Optional[List[int]] = None,
+        # TODO: fix add age and time features, currently dont work
+        #  (might be resolved through commenting out line 161 of _forkin_network.py)
         add_age_feature: bool = False,
+        add_time_feature: bool = False,
         seed: Optional[int] = None,
         decoder_mlp_dim_seq: List[int] = [20],
         channels_seq: List[int] = [30, 30, 30],
@@ -114,10 +125,13 @@ def __init__(
     ) -> None:
 
         use_dynamic_feat_cnn = False
-
         if use_feat_dynamic_real or add_age_feature or add_time_feature:
             use_dynamic_feat_cnn = True
 
+        use_static_feat_cnn = False
+        if use_feat_static_cat or use_feat_static_cat:
+            use_static_feat_cnn = True
+
         if seed:
             np.random.seed(seed)
             mx.random.seed(seed)
@@ -129,18 +143,20 @@ def __init__(
             f"{len(dilation_seq)} vs. {len(kernel_size_seq)}"
         )
 
+        # TODO: figure out whether this needs any additional modification; doesn't seems o
         encoder = HierarchicalCausalConv1DEncoder(
             dilation_seq=dilation_seq,
             kernel_size_seq=channels_seq,
             channels_seq=kernel_size_seq,
             use_residual=use_residual,
             use_dynamic_feat=use_dynamic_feat_cnn,
+            use_static_feat=use_static_feat_cnn,
             prefix="encoder_",
         )
 
-        super(MQCNNEstimator, self).__init__(
+        super().__init__(
             encoder=encoder,
-            use_dynamic_feat=use_feat_dynamic_real,
+            use_dynamic_feat=use_feat_dynamic_real,  # TODO: make use_dynamic_feat this more specific
             add_time_feature=add_time_feature,
             add_age_feature=add_age_feature,
             decoder_mlp_dim_seq=decoder_mlp_dim_seq,
@@ -151,6 +167,7 @@ def __init__(
             quantiles=quantiles,
         )
 
+    # TODO: does this work? I think this might
     @classmethod
     def derive_auto_fields(cls, train_iter):
         stats = calculate_dataset_statistics(train_iter)
@@ -162,6 +179,7 @@ def derive_auto_fields(cls, train_iter):
         }
 
 
+# TODO: integrate MQDNN, change arguments to non mutable
 class MQRNNEstimator(MQDNNEstimator):
     """
     An :class:`MQDNNEstimator` with a Recurrent Neural Network (RNN) as an
@@ -185,7 +203,7 @@ def __init__(
             bidirectional=True,
             prefix="encoder_",
         )
-        super(MQRNNEstimator, self).__init__(
+        super().__init__(
             encoder=encoder,
             decoder_mlp_dim_seq=decoder_mlp_dim_seq,
             freq=freq,
@@ -196,6 +214,7 @@ def __init__(
         )
 
 
+# TODO: REMOVE THIS
 if __name__ == "__main__":
     from gluonts.dataset.repository.datasets import (
         get_dataset,
@@ -232,7 +251,7 @@ def __init__(
         )
 
         agg_metrics, item_metrics = Evaluator()(
-            ts_it, forecast_it, num_series=len(dataset.test)  # type: ignore
+            ts_it, forecast_it, num_series=len(list(dataset.test))
         )
 
         metrics.append(agg_metrics["wQuantileLoss[0.5]"])
diff --git a/src/gluonts/model/seq2seq/_seq2seq_estimator.py b/src/gluonts/model/seq2seq/_seq2seq_estimator.py
index e543887141..2b3211f161 100644
--- a/src/gluonts/model/seq2seq/_seq2seq_estimator.py
+++ b/src/gluonts/model/seq2seq/_seq2seq_estimator.py
@@ -45,6 +45,7 @@
 from ._seq2seq_network import Seq2SeqPredictionNetwork, Seq2SeqTrainingNetwork
 
 
+# TODO: fix mutable arguments
 class Seq2SeqEstimator(GluonEstimator):
     """
     Quantile-Regression Sequence-to-Sequence Estimator
@@ -181,6 +182,7 @@ def create_predictor(
         )
 
 
+# TODO: fix mutable arguments
 class MLP2QRForecaster(Seq2SeqEstimator):
     @validated()
     def __init__(
@@ -215,6 +217,7 @@ def __init__(
         )
 
 
+# TODO: fix mutable arguments
 class RNN2QRForecaster(Seq2SeqEstimator):
     @validated()
     def __init__(
@@ -257,6 +260,7 @@ def __init__(
         )
 
 
+# TODO: fix mutable arguments
 class CNN2QRForecaster(Seq2SeqEstimator):
     @validated()
     def __init__(

From dab925c074fb059ab880475ec2e866e252c9d7a4 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Thu, 16 Apr 2020 12:04:24 +0200
Subject: [PATCH 14/44] Merge from production.

---
 .../model/seq2seq/_forking_estimator.py       | 18 +-----
 src/gluonts/model/seq2seq/_forking_network.py | 64 ++++++++-----------
 .../model/seq2seq/_mq_dnn_estimator.py        | 12 ++--
 src/gluonts/model/seq2seq/_transform.py       |  2 +-
 4 files changed, 34 insertions(+), 62 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index eee591bdbb..e326eed030 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -28,7 +28,6 @@
 from gluonts.support.util import copy_parameters
 from gluonts.trainer import Trainer
 from gluonts.transform import (
-    AsNumpyArray,
     AddAgeFeature,
     AddTimeFeatures,
     Chain,
@@ -41,12 +40,8 @@
 # Relative imports
 from gluonts.time_feature import time_features_from_frequency_str
 from ._forking_network import (
-    ForkingSeq2SeqPredictionNetwork,
-    ForkingSeq2SeqTrainingNetwork,
     ForkingSeq2SeqNetwork,
     ForkingSeq2SeqNetworkBase,
-    ForkingSeq2SeqTargetPredictionNetwork,
-    ForkingSeq2SeqTargetTrainingNetwork,
 )
 from ._transform import ForkingSequenceSplitter
 
@@ -175,10 +170,8 @@ def create_transformation(self) -> Transformation:
         elif len(dynamic_feat_fields) == 1:
             chain.append(
                 RenameFields(
-                    {
-                        dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC_REAL
-                    }  # TODO: find out why this is done
-                )
+                    {dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC_REAL}
+                )  # TODO: find out why this is done
             )
 
         decoder_field = (
@@ -197,13 +190,6 @@ def create_transformation(self) -> Transformation:
         return Chain(chain)
 
     def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
-        # return ForkingSeq2SeqTrainingNetwork(
-        #     encoder=self.encoder,
-        #     enc2dec=PassThroughEnc2Dec(),
-        #     decoder=self.decoder,
-        #     quantile_output=self.quantile_output,
-        # )
-
         return ForkingSeq2SeqNetwork(
             encoder=self.encoder,
             enc2dec=PassThroughEnc2Dec(),
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index fc0af163f4..026ecdd3d8 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -12,10 +12,7 @@
 # permissions and limitations under the License.
 
 # Third-party imports
-import mxnet as mx
-from mxnet import gluon
-from mxnet import nd
-
+from mxnet import gluon, nd
 
 # First-party imports
 from gluonts.block.decoder import Seq2SeqDecoder
@@ -25,7 +22,6 @@
 from gluonts.core.component import validated
 from gluonts.model.common import Tensor
 
-
 nd_None = nd.array([])
 
 
@@ -63,17 +59,15 @@ def __init__(
         self.decoder = decoder
         self.quantile_output = quantile_output
 
-        # self.feat_static_real = F.zeros(shape=(1,))
-        # self.past_feat_dynamic_real = F.zeros(shape=(1,))
-        # self.future_feat_dynamic_real = F.zeros(shape=(1,))
+        self.feat_static_real = nd_None
+        self.past_feat_dynamic_real = nd_None
+        self.future_feat_dynamic_real = nd_None
 
         with self.name_scope():
             self.quantile_proj = quantile_output.get_quantile_proj()
             self.loss = quantile_output.get_loss()
 
 
-# TODO: THIS SHOULD NOT EXIST, the if else logic should be handled in
-#  the _forking_estimator.py, and possible assertions too
 class ForkingSeq2SeqNetwork:
     @validated()
     def __init__(
@@ -113,7 +107,7 @@ def get_training_network(self) -> ForkingSeq2SeqNetworkBase:
             raise NotImplementedError
 
     def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
-        if not self.use_static_cat and not self.use_dynamic_real:
+        if self.use_static_cat is False and self.use_dynamic_real is False:
             return ForkingSeq2SeqTargetPredictionNetwork(
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
@@ -157,27 +151,29 @@ def hybrid_forward(
         loss with shape (FIXME, FIXME)
         """
 
-        # FIXME: can we factor out a common prefix in the base network?
-        feat_static_real = F.zeros(shape=(1,))
-        # TODO: THIS IS OVERWRITING THE ARGUMENT?!?! (REMOVING IT makes add time and age feature work):
-        # past_feat_dynamic_real = F.zeros(shape=(1,))
-        future_feat_dynamic_real = F.zeros(shape=(1,))
+        # print(f"past target: {past_target.shape}")
+        # print(f"past_feat_dynamic_real: {past_feat_dynamic_real.shape}")
+        # print(f"future_target: {future_target.shape}")
 
         # arguments: target, static_features, dynamic_features
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
+            past_target, self.feat_static_real, past_feat_dynamic_real
         )
 
         # arguments: encoder_output_static, encoder_output_dynamic, future_features
         # TODO: figure out how future_features is supposed to be used: since no distinction
         #  between dynamic and static anymore (shape is (N, T, C) suggesting dynamic feature)
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
+            enc_output_static,
+            enc_output_dynamic,
+            self.future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
         dec_dist_output = self.quantile_proj(dec_output)
 
+        # print(f"decoder output: {dec_dist_output.shape}")
+
         loss = self.loss(future_target, dec_dist_output)
         return loss.mean(axis=1)
 
@@ -202,11 +198,8 @@ def hybrid_forward(
 
         # FIXME: can we factor out a common prefix in the base network?
 
-        feat_static_real = F.zeros(shape=(1,))
-        future_feat_dynamic_real = F.zeros(shape=(1,))
-
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
+            past_target, self.feat_static_real, past_feat_dynamic_real
         )
 
         enc_output_static = (
@@ -214,7 +207,9 @@ def hybrid_forward(
         )
 
         dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real,
+            enc_output_static,
+            enc_output_dynamic,
+            self.future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
@@ -245,16 +240,14 @@ def hybrid_forward(
         loss with shape (FIXME, FIXME)
         """
 
-        feat_static_real = F.zeros(shape=(1,))
-        past_feat_dynamic_real = F.zeros(shape=(1,))
-        future_feat_dynamic_real = F.zeros(shape=(1,))
-
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
+            past_target, self.feat_static_real, self.past_feat_dynamic_real
         )
 
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
+            enc_output_static,
+            enc_output_dynamic,
+            self.future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
@@ -281,22 +274,19 @@ def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
         """
 
         # FIXME: can we factor out a common prefix in the base network?
-        feat_static_real = F.zeros(shape=(1,))
-        past_feat_dynamic_real = F.zeros(shape=(1,))
-        future_feat_dynamic_real = F.zeros(shape=(1,))
 
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
+            past_target, self.feat_static_real, self.past_feat_dynamic_real
         )
 
         enc_output_static = (
-            F.zeros(shape=(1,))
-            if enc_output_static is None
-            else enc_output_static
+            nd_None if enc_output_static is None else enc_output_static
         )
 
         dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
+            enc_output_static,
+            enc_output_dynamic,
+            self.future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 62eb9cb28d..0259ccdb2a 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -12,7 +12,7 @@
 # permissions and limitations under the License.
 
 # Standard library imports
-from typing import List, Optional, Sized
+from typing import List, Optional
 
 # First-party imports
 from gluonts.dataset.stat import calculate_dataset_statistics
@@ -108,10 +108,8 @@ def __init__(
         use_feat_dynamic_real: bool = False,
         use_feat_static_cat: bool = False,
         cardinality: Optional[List[int]] = None,
-        # TODO: fix add age and time features, currently dont work
-        #  (might be resolved through commenting out line 161 of _forkin_network.py)
-        add_age_feature: bool = False,
         add_time_feature: bool = False,
+        add_age_feature: bool = False,
         seed: Optional[int] = None,
         decoder_mlp_dim_seq: List[int] = [20],
         channels_seq: List[int] = [30, 30, 30],
@@ -146,11 +144,10 @@ def __init__(
         # TODO: figure out whether this needs any additional modification; doesn't seems o
         encoder = HierarchicalCausalConv1DEncoder(
             dilation_seq=dilation_seq,
-            kernel_size_seq=channels_seq,
-            channels_seq=kernel_size_seq,
+            kernel_size_seq=kernel_size_seq,
+            channels_seq=channels_seq,
             use_residual=use_residual,
             use_dynamic_feat=use_dynamic_feat_cnn,
-            use_static_feat=use_static_feat_cnn,
             prefix="encoder_",
         )
 
@@ -167,7 +164,6 @@ def __init__(
             quantiles=quantiles,
         )
 
-    # TODO: does this work? I think this might
     @classmethod
     def derive_auto_fields(cls, train_iter):
         stats = calculate_dataset_statistics(train_iter)
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index d8f3564a63..72df2a0d8e 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -130,8 +130,8 @@ def flatmap_transform(
                     forking_dec_field = np.zeros(
                         shape=(self.enc_len, self.dec_len) + d3
                     )
-                    skip = max(0, self.enc_len - sampling_idx)
 
+                    skip = max(0, self.enc_len - 1 - sampling_idx)
                     for dec_field, idx in zip(
                         forking_dec_field[skip:],
                         range(start_idx + 1, start_idx + self.enc_len + 1),

From 923510bdb881411a9787241dc406e1127a688b95 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Thu, 16 Apr 2020 13:30:53 +0200
Subject: [PATCH 15/44] Fixing mq_dnn single quantile error and type errors.

---
 src/gluonts/block/quantile_output.py          |  9 ++-
 src/gluonts/model/seq2seq/_forking_network.py | 66 ++++++++++---------
 .../model/seq2seq/_mq_dnn_estimator.py        |  1 +
 src/gluonts/model/seq2seq/_transform.py       |  2 +-
 test/model/seq2seq/test_model.py              |  9 ++-
 5 files changed, 50 insertions(+), 37 deletions(-)

diff --git a/src/gluonts/block/quantile_output.py b/src/gluonts/block/quantile_output.py
index 9b0fd38950..3ae4f54d62 100644
--- a/src/gluonts/block/quantile_output.py
+++ b/src/gluonts/block/quantile_output.py
@@ -86,9 +86,12 @@ def hybrid_forward(
         Tensor
             weighted sum of the quantile losses, shape N1 x N1 x ... Nk
         """
-        y_pred_all = F.split(
-            y_pred, axis=-1, num_outputs=self.num_quantiles, squeeze_axis=1
-        )
+        if self.num_quantiles > 1:
+            y_pred_all = F.split(
+                y_pred, axis=-1, num_outputs=self.num_quantiles, squeeze_axis=1
+            )
+        else:
+            y_pred_all = [F.squeeze(y_pred, axis=-1)]
 
         qt_loss = []
         for i, y_pred_q in enumerate(y_pred_all):
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 026ecdd3d8..0d5acc6bc7 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -12,7 +12,9 @@
 # permissions and limitations under the License.
 
 # Third-party imports
-from mxnet import gluon, nd
+import mxnet as mx
+from mxnet import gluon
+
 
 # First-party imports
 from gluonts.block.decoder import Seq2SeqDecoder
@@ -22,8 +24,6 @@
 from gluonts.core.component import validated
 from gluonts.model.common import Tensor
 
-nd_None = nd.array([])
-
 
 class ForkingSeq2SeqNetworkBase(gluon.HybridBlock):
     """
@@ -59,15 +59,13 @@ def __init__(
         self.decoder = decoder
         self.quantile_output = quantile_output
 
-        self.feat_static_real = nd_None
-        self.past_feat_dynamic_real = nd_None
-        self.future_feat_dynamic_real = nd_None
-
         with self.name_scope():
             self.quantile_proj = quantile_output.get_quantile_proj()
             self.loss = quantile_output.get_loss()
 
 
+# TODO: THIS SHOULD NOT EXIST, the if else logic should be handled in
+#  the _forking_estimator.py, and possible assertions too
 class ForkingSeq2SeqNetwork:
     @validated()
     def __init__(
@@ -107,7 +105,7 @@ def get_training_network(self) -> ForkingSeq2SeqNetworkBase:
             raise NotImplementedError
 
     def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
-        if self.use_static_cat is False and self.use_dynamic_real is False:
+        if not self.use_static_cat and not self.use_dynamic_real:
             return ForkingSeq2SeqTargetPredictionNetwork(
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
@@ -151,29 +149,27 @@ def hybrid_forward(
         loss with shape (FIXME, FIXME)
         """
 
-        # print(f"past target: {past_target.shape}")
-        # print(f"past_feat_dynamic_real: {past_feat_dynamic_real.shape}")
-        # print(f"future_target: {future_target.shape}")
+        # FIXME: can we factor out a common prefix in the base network?
+        feat_static_real = F.zeros(shape=(1,))
+        # TODO: THIS IS OVERWRITING THE ARGUMENT?!?! (REMOVING IT makes add time and age feature work):
+        # past_feat_dynamic_real = F.zeros(shape=(1,))
+        future_feat_dynamic_real = F.zeros(shape=(1,))
 
         # arguments: target, static_features, dynamic_features
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, self.feat_static_real, past_feat_dynamic_real
+            past_target, feat_static_real, past_feat_dynamic_real
         )
 
         # arguments: encoder_output_static, encoder_output_dynamic, future_features
         # TODO: figure out how future_features is supposed to be used: since no distinction
         #  between dynamic and static anymore (shape is (N, T, C) suggesting dynamic feature)
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static,
-            enc_output_dynamic,
-            self.future_feat_dynamic_real,
+            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
         )
 
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
         dec_dist_output = self.quantile_proj(dec_output)
 
-        # print(f"decoder output: {dec_dist_output.shape}")
-
         loss = self.loss(future_target, dec_dist_output)
         return loss.mean(axis=1)
 
@@ -198,18 +194,21 @@ def hybrid_forward(
 
         # FIXME: can we factor out a common prefix in the base network?
 
+        feat_static_real = F.zeros(shape=(1,))
+        future_feat_dynamic_real = F.zeros(shape=(1,))
+
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, self.feat_static_real, past_feat_dynamic_real
+            past_target, feat_static_real, past_feat_dynamic_real
         )
 
         enc_output_static = (
-            nd_None if enc_output_static is None else enc_output_static
+            F.zeros(shape=(1,))
+            if enc_output_static is None
+            else enc_output_static
         )
 
         dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static,
-            enc_output_dynamic,
-            self.future_feat_dynamic_real,
+            enc_output_static, enc_output_dynamic, future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
@@ -240,14 +239,16 @@ def hybrid_forward(
         loss with shape (FIXME, FIXME)
         """
 
+        feat_static_real = F.zeros(shape=(1,))
+        past_feat_dynamic_real = F.zeros(shape=(1,))
+        future_feat_dynamic_real = F.zeros(shape=(1,))
+
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, self.feat_static_real, self.past_feat_dynamic_real
+            past_target, feat_static_real, past_feat_dynamic_real
         )
 
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static,
-            enc_output_dynamic,
-            self.future_feat_dynamic_real,
+            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
         )
 
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
@@ -274,19 +275,22 @@ def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
         """
 
         # FIXME: can we factor out a common prefix in the base network?
+        feat_static_real = F.zeros(shape=(1,))
+        past_feat_dynamic_real = F.zeros(shape=(1,))
+        future_feat_dynamic_real = F.zeros(shape=(1,))
 
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, self.feat_static_real, self.past_feat_dynamic_real
+            past_target, feat_static_real, past_feat_dynamic_real
         )
 
         enc_output_static = (
-            nd_None if enc_output_static is None else enc_output_static
+            F.zeros(shape=(1,))
+            if enc_output_static is None
+            else enc_output_static
         )
 
         dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static,
-            enc_output_dynamic,
-            self.future_feat_dynamic_real,
+            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
         )
 
         dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 0259ccdb2a..fdc60fd4ab 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -148,6 +148,7 @@ def __init__(
             channels_seq=channels_seq,
             use_residual=use_residual,
             use_dynamic_feat=use_dynamic_feat_cnn,
+            # use_static_feat=use_static_feat_cnn,  # TODO: enable this
             prefix="encoder_",
         )
 
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index 72df2a0d8e..1943e1d0c4 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -131,7 +131,7 @@ def flatmap_transform(
                         shape=(self.enc_len, self.dec_len) + d3
                     )
 
-                    skip = max(0, self.enc_len - 1 - sampling_idx)
+                    skip = max(0, self.enc_len - sampling_idx)
                     for dec_field, idx in zip(
                         forking_dec_field[skip:],
                         range(start_idx + 1, start_idx + self.enc_len + 1),
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index ca4cc135f4..db78e1eddf 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -41,9 +41,14 @@ def Estimator(request):
     return request.param
 
 
+@pytest.mark.parametrize("quantiles", [[0.1, 0.5, 0.9], [0.5]])
 @pytest.mark.parametrize("hybridize", [True, False])
-def test_accuracy(Estimator, accuracy_test, hyperparameters, hybridize):
-    hyperparameters.update(num_batches_per_epoch=100, hybridize=hybridize)
+def test_accuracy(
+    Estimator, accuracy_test, hyperparameters, hybridize, quantiles
+):
+    hyperparameters.update(
+        num_batches_per_epoch=100, hybridize=hybridize, quantiles=quantiles
+    )
 
     accuracy_test(Estimator, hyperparameters, accuracy=0.25)
 

From e5530955f8a6b4905615534fd0300c4d2b35d12f Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Fri, 17 Apr 2020 14:08:13 +0200
Subject: [PATCH 16/44] Refactoring dnn_estimator file.

---
 .../model/seq2seq/_forking_estimator.py       |  33 ++-
 .../model/seq2seq/_mq_dnn_estimator.py        | 201 ++++++------------
 2 files changed, 90 insertions(+), 144 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index e326eed030..5bf56ef1a8 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -101,14 +101,23 @@ def __init__(
         context_length: Optional[int] = None,
         trainer: Trainer = Trainer(),
     ) -> None:
+        super().__init__(trainer=trainer)
+
         assert (
             context_length is None or context_length > 0
         ), "The value of `context_length` should be > 0"
         assert (
             prediction_length > 0
         ), "The value of `prediction_length` should be > 0"
-
-        super().__init__(trainer=trainer)
+        # assert (cardinality and use_feat_static_cat) or (
+        #     not (cardinality or use_feat_static_cat)
+        # ), "You should set `cardinality` if and only if `use_feat_static_cat=True`"
+        # assert cardinality is None or all(
+        #     [c > 0 for c in cardinality]
+        # ), "Elements of `cardinality` should be > 0"
+        # assert embedding_dimension is None or all(
+        #     [e > 0 for e in embedding_dimension]
+        # ), "Elements of `embedding_dimension` should be > 0"
 
         self.encoder = encoder
         self.decoder = decoder
@@ -122,14 +131,26 @@ def __init__(
         self.add_time_feature = add_time_feature
         self.add_age_feature = add_age_feature
 
+        # self.use_feat_static_cat = use_feat_static_cat
+        # self.use_feat_dynamic_real = use_feat_dynamic_real
+        # self.cardinality = (
+        #     cardinality if cardinality and use_feat_static_cat else [1]
+        # )
+        # self.embedding_dimension = (
+        #     embedding_dimension
+        #     if embedding_dimension is not None
+        #     else [min(50, (cat + 1) // 2) for cat in self.cardinality]
+        # )
+
         # TODO: refactor this variable name: dynamic_network, in fact it
         #  is not even necessary as is, because this is how use_dynamic_feat was
         #  set in MQCNNEstimator and otherwise its not used, i.e. False
         # is target only network or not?
-        self.dynamic_network = (
+        self.use_dynamic_real = (
             use_dynamic_feat or add_time_feature or add_age_feature
         )
-        print(f"use_dynamic_network: {self.dynamic_network}")
+
+        print(f"use_dynamic_network: {self.use_dynamic_real}")
 
     def create_transformation(self) -> Transformation:
         chain = []
@@ -195,7 +216,7 @@ def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
             enc2dec=PassThroughEnc2Dec(),
             decoder=self.decoder,
             quantile_output=self.quantile_output,
-            use_dynamic_real=self.dynamic_network,
+            use_dynamic_real=self.use_dynamic_real,
         ).get_training_network()
 
     def create_predictor(
@@ -214,7 +235,7 @@ def create_predictor(
             enc2dec=trained_network.enc2dec,
             decoder=trained_network.decoder,
             quantile_output=trained_network.quantile_output,
-            use_dynamic_real=self.dynamic_network,
+            use_dynamic_real=self.use_dynamic_real,
         ).get_prediction_network()
 
         copy_parameters(trained_network, prediction_network)
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index fdc60fd4ab..67246f7f48 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -14,100 +14,42 @@
 # Standard library imports
 from typing import List, Optional
 
+# Third-party imports
+import numpy as np
+import mxnet as mx
+
 # First-party imports
 from gluonts.dataset.stat import calculate_dataset_statistics
-from gluonts.evaluation.backtest import make_evaluation_predictions
 from gluonts.block.decoder import ForkingMLPDecoder
-from gluonts.block.encoder import (
-    HierarchicalCausalConv1DEncoder,
-    RNNEncoder,
-    Seq2SeqEncoder,
-)
+from gluonts.block.encoder import HierarchicalCausalConv1DEncoder, RNNEncoder
 from gluonts.block.quantile_output import QuantileOutput
 from gluonts.core.component import validated
 from gluonts.trainer import Trainer
-
-# Relative imports
 from gluonts.model.seq2seq._forking_estimator import ForkingSeq2SeqEstimator
-from gluonts.evaluation import Evaluator
-import numpy as np
-import mxnet as mx
+
 
 # TODO: in general, it seems unnecessary to put the MQCNN and MQRNN into Seq2Seq since their commonality in code with
 #  the rest is just the abstract classes Seq2SeqDecoder and Se2SeqEncoder,
 #  and the Estimator is not based on Seq2SeqEstimator!
 
 
-# TODO: THIS CLASS SHOULD NOT EXIST, the decoder
-#  can be defined in each current subclass
-class MQDNNEstimator(ForkingSeq2SeqEstimator):
-    """
-    Intermediate base class for a Multi-horizon Quantile Deep Neural Network
-    (MQ-DNN), [WTN+17]_. The class fixes the decoder is a multi-quantile MLP.
-    Subclasses fix the encoder to be either a Convolutional Neural Network
-    (MQ-CNN) or a Recurrent Neural Network (MQ-RNN).
-    """
-
-    @validated()
-    def __init__(
-        self,
-        encoder: Seq2SeqEncoder,
-        context_length: Optional[int],
-        prediction_length: int,
-        freq: str,
-        use_dynamic_feat: bool = False,
-        add_time_feature: bool = False,
-        add_age_feature: bool = False,
-        decoder_mlp_dim_seq: List[int] = [20],
-        quantiles: List[float] = list(),
-        trainer: Trainer = Trainer(),
-    ) -> None:
-        context_length = (
-            prediction_length if context_length is None else context_length
-        )
-        assert all(
-            [d > 0 for d in decoder_mlp_dim_seq]
-        ), "Elements of `mlp_hidden_dimension_seq` should be > 0"
-
-        decoder = ForkingMLPDecoder(
-            dec_len=prediction_length,
-            final_dim=decoder_mlp_dim_seq[-1],
-            hidden_dimension_sequence=decoder_mlp_dim_seq[:-1],
-            prefix="decoder_",
-        )
-
-        quantile_output = QuantileOutput(quantiles)
-
-        super().__init__(
-            encoder=encoder,
-            decoder=decoder,
-            quantile_output=quantile_output,
-            freq=freq,
-            use_dynamic_feat=use_dynamic_feat,
-            add_age_feature=add_age_feature,
-            add_time_feature=add_time_feature,
-            prediction_length=prediction_length,
-            context_length=context_length,
-            trainer=trainer,
-        )
-
-
 # TODO: integrate MQDNN, change arguments to non mutable
-class MQCNNEstimator(MQDNNEstimator):
+class MQCNNEstimator(ForkingSeq2SeqEstimator):
     """
     An :class:`MQDNNEstimator` with a Convolutional Neural Network (CNN) as an
-    encoder. Implements the MQ-CNN Forecaster, proposed in [WTN+17]_.
+    encoder and a multi-quantile MLP as a decoder. Implements the MQ-CNN Forecaster, proposed in [WTN+17]_.
     """
 
     @validated()
     def __init__(
         self,
-        prediction_length: int,
         freq: str,
+        prediction_length: int,
         context_length: Optional[int] = None,
         use_feat_dynamic_real: bool = False,
         use_feat_static_cat: bool = False,
-        cardinality: Optional[List[int]] = None,
+        cardinality: List[int] = None,
+        embedding_dimension: List[int] = None,
         add_time_feature: bool = False,
         add_age_feature: bool = False,
         seed: Optional[int] = None,
@@ -122,14 +64,6 @@ def __init__(
         trainer: Trainer = Trainer(),
     ) -> None:
 
-        use_dynamic_feat_cnn = False
-        if use_feat_dynamic_real or add_age_feature or add_time_feature:
-            use_dynamic_feat_cnn = True
-
-        use_static_feat_cnn = False
-        if use_feat_static_cat or use_feat_static_cat:
-            use_static_feat_cnn = True
-
         if seed:
             np.random.seed(seed)
             mx.random.seed(seed)
@@ -140,29 +74,47 @@ def __init__(
             f"mismatch CNN configurations: {len(channels_seq)} vs. "
             f"{len(dilation_seq)} vs. {len(kernel_size_seq)}"
         )
+        assert (
+            prediction_length > 0
+        ), f"Invalid prediction length: {prediction_length}."
+        assert all(
+            [d > 0 for d in decoder_mlp_dim_seq]
+        ), "Elements of `mlp_hidden_dimension_seq` should be > 0"
+
+        use_dynamic_feat = (
+            use_feat_dynamic_real or add_age_feature or add_time_feature
+        )
 
-        # TODO: figure out whether this needs any additional modification; doesn't seems o
         encoder = HierarchicalCausalConv1DEncoder(
             dilation_seq=dilation_seq,
             kernel_size_seq=kernel_size_seq,
             channels_seq=channels_seq,
             use_residual=use_residual,
-            use_dynamic_feat=use_dynamic_feat_cnn,
-            # use_static_feat=use_static_feat_cnn,  # TODO: enable this
+            use_dynamic_feat=use_dynamic_feat,
+            use_static_feat=use_feat_static_cat,
             prefix="encoder_",
         )
 
+        decoder = ForkingMLPDecoder(
+            dec_len=prediction_length,
+            final_dim=decoder_mlp_dim_seq[-1],
+            hidden_dimension_sequence=decoder_mlp_dim_seq[:-1],
+            prefix="decoder_",
+        )
+
+        quantile_output = QuantileOutput(quantiles)
+
         super().__init__(
             encoder=encoder,
-            use_dynamic_feat=use_feat_dynamic_real,  # TODO: make use_dynamic_feat this more specific
-            add_time_feature=add_time_feature,
-            add_age_feature=add_age_feature,
-            decoder_mlp_dim_seq=decoder_mlp_dim_seq,
+            decoder=decoder,
+            quantile_output=quantile_output,
             freq=freq,
             prediction_length=prediction_length,
-            trainer=trainer,
             context_length=context_length,
-            quantiles=quantiles,
+            use_dynamic_feat=use_dynamic_feat,
+            add_time_feature=add_time_feature,
+            add_age_feature=add_age_feature,
+            trainer=trainer,
         )
 
     @classmethod
@@ -171,16 +123,16 @@ def derive_auto_fields(cls, train_iter):
 
         return {
             "use_feat_dynamic_real": stats.num_feat_dynamic_real > 0,
-            # "use_feat_static_cat": bool(stats.feat_static_cat),
-            # "cardinality": [len(cats) for cats in stats.feat_static_cat],
+            "use_feat_static_cat": bool(stats.feat_static_cat),
+            "cardinality": [len(cats) for cats in stats.feat_static_cat],
         }
 
 
 # TODO: integrate MQDNN, change arguments to non mutable
-class MQRNNEstimator(MQDNNEstimator):
+class MQRNNEstimator(ForkingSeq2SeqEstimator):
     """
     An :class:`MQDNNEstimator` with a Recurrent Neural Network (RNN) as an
-    encoder. Implements the MQ-RNN Forecaster, proposed in [WTN+17]_.
+    encoder and a multi-quantile MLP as a decoder. Implements the MQ-RNN Forecaster, proposed in [WTN+17]_.
     """
 
     @validated()
@@ -193,6 +145,14 @@ def __init__(
         trainer: Trainer = Trainer(),
         quantiles: List[float] = list([0.1, 0.5, 0.9]),
     ) -> None:
+
+        assert (
+            prediction_length > 0
+        ), f"Invalid prediction length: {prediction_length}."
+        assert all(
+            [d > 0 for d in decoder_mlp_dim_seq]
+        ), "Elements of `mlp_hidden_dimension_seq` should be > 0"
+
         encoder = RNNEncoder(
             mode="gru",
             hidden_size=50,
@@ -200,57 +160,22 @@ def __init__(
             bidirectional=True,
             prefix="encoder_",
         )
+
+        decoder = ForkingMLPDecoder(
+            dec_len=prediction_length,
+            final_dim=decoder_mlp_dim_seq[-1],
+            hidden_dimension_sequence=decoder_mlp_dim_seq[:-1],
+            prefix="decoder_",
+        )
+
+        quantile_output = QuantileOutput(quantiles)
+
         super().__init__(
             encoder=encoder,
-            decoder_mlp_dim_seq=decoder_mlp_dim_seq,
+            decoder=decoder,
+            quantile_output=quantile_output,
             freq=freq,
             prediction_length=prediction_length,
-            trainer=trainer,
             context_length=context_length,
-            quantiles=quantiles,
-        )
-
-
-# TODO: REMOVE THIS
-if __name__ == "__main__":
-    from gluonts.dataset.repository.datasets import (
-        get_dataset,
-        dataset_recipes,
-    )
-
-    print(f"datasets available: {dataset_recipes.keys()}")
-
-    # we pick m4_hourly as it only contains a few hundred time series
-    dataset = get_dataset("m4_hourly", regenerate=False)
-
-    metrics = []
-
-    for _ in range(1):
-        estimator = MQCNNEstimator.from_inputs(
-            dataset.train,
-            # add_time_feature=True,
-            # add_age_feature=True,
-            prediction_length=dataset.metadata.prediction_length,
-            seed=42,
-            freq=dataset.metadata.freq,
-            quantiles=[0.5],
-            trainer=Trainer(
-                epochs=1, num_batches_per_epoch=10, hybridize=True
-            ),
-        )
-
-        predictor = estimator.train(dataset.train)
-
-        assert dataset.test is not None
-
-        forecast_it, ts_it = make_evaluation_predictions(
-            dataset.test, predictor=predictor, num_samples=100
-        )
-
-        agg_metrics, item_metrics = Evaluator()(
-            ts_it, forecast_it, num_series=len(list(dataset.test))
+            trainer=trainer,
         )
-
-        metrics.append(agg_metrics["wQuantileLoss[0.5]"])
-
-    print(metrics)

From 1ba58718118775fdca85baed2795886bd8766aae Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Fri, 17 Apr 2020 17:48:20 +0200
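Subject: [PATCH 17/44] Adding additional tests, minor bugfix.

Bugfix (_forking_estimator.py): when exactly one dynamic feature field
is present, only rename it to FEAT_DYNAMIC_REAL if it is not already
FEAT_DYNAMIC_REAL.

Static categorical features are disabled again for now:
* ForkingSeq2SeqNetwork asserts `use_static_cat is False`, and the
  training/prediction network is selected on `use_dynamic_real` alone
  (the NotImplementedError branches are removed).
* MQCNNEstimator no longer passes `use_static_feat` to the encoder, and
  derive_auto_fields no longer returns `use_feat_static_cat` or
  `cardinality`.

Tests (test_model.py): add test_mqcnn_covariate_smoke_test, parametrized
over `use_feat_dynamic_real`, `add_time_feature` and `add_age_feature`,
which trains an MQCNNEstimator on a dummy dataset with features and
checks that one forecast is produced per test series.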
---
 .../model/seq2seq/_forking_estimator.py       | 37 ++++++++++++++++++-
 src/gluonts/model/seq2seq/_forking_network.py | 15 ++++----
 .../model/seq2seq/_mq_dnn_estimator.py        |  6 +--
 test/model/seq2seq/test_model.py              | 34 ++++++++++++++++-
 4 files changed, 78 insertions(+), 14 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 5bf56ef1a8..2b086a1acd 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -35,6 +35,8 @@
     Transformation,
     VstackFeatures,
     RenameFields,
+    SetField,
+    RemoveFields,
 )
 
 # Relative imports
@@ -153,6 +155,12 @@ def __init__(
         print(f"use_dynamic_network: {self.use_dynamic_real}")
 
     def create_transformation(self) -> Transformation:
+        # remove_field_names = [FieldName.FEAT_DYNAMIC_CAT]
+        # if not self.use_feat_static_real:
+        #     remove_field_names.append(FieldName.FEAT_STATIC_REAL)
+        # if not self.use_feat_dynamic_real:
+        #     remove_field_names.append(FieldName.FEAT_DYNAMIC_REAL)
+
         chain = []
         dynamic_feat_fields = []
 
@@ -188,18 +196,43 @@ def create_transformation(self) -> Transformation:
                     input_fields=dynamic_feat_fields,
                 )
             )
-        elif len(dynamic_feat_fields) == 1:
+        elif (
+            len(dynamic_feat_fields) == 1
+            and FieldName.FEAT_DYNAMIC_REAL not in dynamic_feat_fields
+        ):
             chain.append(
                 RenameFields(
                     {dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC_REAL}
-                )  # TODO: find out why this is done
+                )
             )
 
+        # TODO: current problem: cannot have no input, if some input provided, because the decoder will not
+        #  accept input, however, the batches contain input, and python complains that
+        #  it cannot map something to nothing?
+
+        # if dynamic_feat_fields:
+        #     chain.append(
+        #         VstackFeatures(
+        #             output_field=FieldName.FEAT_DYNAMIC_REAL,
+        #             input_fields=dynamic_feat_fields,
+        #         )
+        #     )
+        # else:
+        #     # Unfortunately we always need to pass something.
+        #     # Passing a constant does not have an effect on performance and essentially acts as a bias term.
+        #     SetField(
+        #         output_field=FieldName.FEAT_DYNAMIC_REAL, value=[0.0]
+        #     )
+        #     dynamic_feat_fields.append(FieldName.FEAT_DYNAMIC_REAL)
+
+        # So far the decoder only uses dynamic real
         decoder_field = (
             [FieldName.FEAT_DYNAMIC_REAL] if dynamic_feat_fields else []
         )
 
         chain.append(
+            # because of how the forking decoder works, every time step
+            # in context is used for splitting, which is why we use the TestSplitSampler
             ForkingSequenceSplitter(
                 train_sampler=TestSplitSampler(),
                 enc_len=self.context_length,
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 0d5acc6bc7..c01ef4c3cf 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -86,41 +86,40 @@ def __init__(
         self.use_dynamic_real = use_dynamic_real
         self.use_static_cat = use_static_cat
 
+        # TODO: add this feature:
+        assert self.use_static_cat is False
+
     def get_training_network(self) -> ForkingSeq2SeqNetworkBase:
-        if self.use_static_cat is False and self.use_dynamic_real is False:
+        if self.use_dynamic_real is False:
             return ForkingSeq2SeqTargetTrainingNetwork(
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
                 decoder=self.decoder,
                 quantile_output=self.quantile_output,
             )
-        elif self.use_static_cat is False and self.use_dynamic_real:
+        else:
             return ForkingSeq2SeqTrainingNetwork(
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
                 decoder=self.decoder,
                 quantile_output=self.quantile_output,
             )
-        else:
-            raise NotImplementedError
 
     def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
-        if not self.use_static_cat and not self.use_dynamic_real:
+        if self.use_dynamic_real is False:
             return ForkingSeq2SeqTargetPredictionNetwork(
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
                 decoder=self.decoder,
                 quantile_output=self.quantile_output,
             )
-        elif self.use_static_cat is False and self.use_dynamic_real:
+        else:
             return ForkingSeq2SeqPredictionNetwork(
                 encoder=self.encoder,
                 enc2dec=self.enc2dec,
                 decoder=self.decoder,
                 quantile_output=self.quantile_output,
             )
-        else:
-            raise NotImplementedError
 
 
 # TODO: figure out whether we need 2 classes each, in fact we would need 4 each,
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 67246f7f48..96cd710791 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -91,7 +91,7 @@ def __init__(
             channels_seq=channels_seq,
             use_residual=use_residual,
             use_dynamic_feat=use_dynamic_feat,
-            use_static_feat=use_feat_static_cat,
+            # use_static_feat=use_feat_static_cat,
             prefix="encoder_",
         )
 
@@ -123,8 +123,8 @@ def derive_auto_fields(cls, train_iter):
 
         return {
             "use_feat_dynamic_real": stats.num_feat_dynamic_real > 0,
-            "use_feat_static_cat": bool(stats.feat_static_cat),
-            "cardinality": [len(cats) for cats in stats.feat_static_cat],
+            # "use_feat_static_cat": bool(stats.feat_static_cat),
+            # "cardinality": [len(cats) for cats in stats.feat_static_cat],
         }
 
 
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index db78e1eddf..4502cf5969 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -16,8 +16,8 @@
 from gluonts.model.seq2seq import (
     MQCNNEstimator,
     MQRNNEstimator,
-    Seq2SeqEstimator,
 )
+from gluonts.testutil.dummy_datasets import make_dummy_datasets_with_features
 
 
 @pytest.fixture()
@@ -53,6 +53,38 @@ def test_accuracy(
     accuracy_test(Estimator, hyperparameters, accuracy=0.25)
 
 
+@pytest.mark.parametrize("use_feat_dynamic_real", [True, False])
+@pytest.mark.parametrize("add_time_feature", [True, False])
+@pytest.mark.parametrize("add_age_feature", [True, False])
+def test_mqcnn_covariate_smoke_test(
+    use_feat_dynamic_real, add_time_feature, add_age_feature
+):
+    hps = {
+        "seed": 42,
+        "freq": "D",
+        "prediction_length": 3,
+        "quantiles": [0.5, 0.1],
+        "epochs": 3,
+        "num_batches_per_epoch": 3,
+        "use_feat_dynamic_real": use_feat_dynamic_real,
+        "add_time_feature": add_time_feature,
+        "add_age_feature": add_age_feature,
+    }
+
+    dataset_train, dataset_test = make_dummy_datasets_with_features(
+        cardinality=[3, 10, 42],
+        num_feat_dynamic_real=3,
+        freq=hps["freq"],
+        prediction_length=hps["prediction_length"],
+    )
+
+    estimator = MQCNNEstimator.from_hyperparameters(**hps)
+
+    predictor = estimator.train(dataset_train)
+    forecasts = list(predictor.predict(dataset_test))
+    assert len(forecasts) == len(dataset_test)
+
+
 def test_repr(Estimator, repr_test, hyperparameters):
     repr_test(Estimator, hyperparameters)
 

From 8ba66e0e2c016d21f5defbac270583f094e83c4c Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Fri, 17 Apr 2020 23:25:45 +0200
Subject: [PATCH 18/44] Major refactoring that allows for disabling inputs at
 will. All tests pass.

---
 src/gluonts/block/decoder.py                  |   4 +-
 src/gluonts/block/encoder.py                  | 117 +++---------
 src/gluonts/dataset/field_names.py            |   2 +
 .../model/seq2seq/_forking_estimator.py       | 108 ++++++-----
 src/gluonts/model/seq2seq/_forking_network.py | 168 +++---------------
 .../model/seq2seq/_mq_dnn_estimator.py        |  12 +-
 .../model/seq2seq/_seq2seq_estimator.py       |   6 +-
 src/gluonts/model/seq2seq/_transform.py       |   4 +
 src/gluonts/transform/field.py                |  10 +-
 test/model/seq2seq/test_model.py              |   1 +
 10 files changed, 133 insertions(+), 299 deletions(-)

diff --git a/src/gluonts/block/decoder.py b/src/gluonts/block/decoder.py
index 69eeeaaff0..deabc87175 100644
--- a/src/gluonts/block/decoder.py
+++ b/src/gluonts/block/decoder.py
@@ -52,7 +52,7 @@ def hybrid_forward(
         pass
 
 
-# TODO: add support for static variables
+# TODO: add support for static variables at some point
 class ForkingMLPDecoder(Seq2SeqDecoder):
     """
     Multilayer perceptron decoder for sequence-to-sequence models.
@@ -105,7 +105,7 @@ def __init__(
             )
             self.model.add(layer)
 
-    # TODO: add support for static input
+    # TODO: add support for static input at some point
     def hybrid_forward(
         self, F, dynamic_input: Tensor, static_input: Tensor = None
     ) -> Tensor:
diff --git a/src/gluonts/block/encoder.py b/src/gluonts/block/encoder.py
index 77d7ea4f68..f3f7f23ce2 100644
--- a/src/gluonts/block/encoder.py
+++ b/src/gluonts/block/encoder.py
@@ -236,6 +236,8 @@ def hybrid_forward(
         else:
             inputs = target
 
+        print("Been here done that.")
+
         # NTC -> NCT (or NCW)
         ct = inputs.swapaxes(1, 2)
         ct = self.cnn(ct)
@@ -248,12 +250,15 @@ def hybrid_forward(
         # return the last state as the static code
         static_code = F.slice_axis(ct, axis=1, begin=-1, end=None)
         static_code = F.squeeze(static_code, axis=1)
+
+        print("Been here done that. 2.")
+
         return static_code, ct
 
 
 class RNNEncoder(Seq2SeqEncoder):
     """
-    Defines an RNN as the encoder.
+     Defines RNN encoder that uses covariates and target as input to the RNN if desired.
 
     Parameters
     ----------
@@ -278,12 +283,20 @@ def __init__(
         hidden_size: int,
         num_layers: int,
         bidirectional: bool,
+        use_static_feat: bool = False,
+        use_dynamic_feat: bool = False,
         **kwargs,
     ) -> None:
         assert num_layers > 0, "`num_layers` value must be greater than zero"
         assert hidden_size > 0, "`hidden_size` value must be greater than zero"
 
         super().__init__(**kwargs)
+        self.mode = mode
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.bidirectional = bidirectional
+        self.use_static_feat = use_static_feat
+        self.use_dynamic_feat = use_dynamic_feat
 
         with self.name_scope():
             self.rnn = RNN(mode, hidden_size, num_layers, bidirectional)
@@ -324,7 +337,19 @@ def hybrid_forward(
             dynamic code,
             shape (batch_size, sequence_length, num_dynamic_features)
         """
-        dynamic_code = self.rnn(target)
+        if self.use_dynamic_feat and self.use_static_feat:
+            inputs = self._assemble_inputs(
+                F,
+                target=target,
+                static_features=static_features,
+                dynamic_features=dynamic_features,
+            )
+        elif self.use_dynamic_feat:
+            inputs = F.concat(target, dynamic_features, dim=2)  # (N, T, C)
+        else:
+            inputs = target
+
+        dynamic_code = self.rnn(inputs)
         static_code = F.slice_axis(dynamic_code, axis=1, begin=-1, end=None)
         return static_code, dynamic_code
 
@@ -388,91 +413,3 @@ def hybrid_forward(
         static_code = self.model(inputs)
         dynamic_code = F.zeros_like(target).expand_dims(2)
         return static_code, dynamic_code
-
-
-class RNNCovariateEncoder(Seq2SeqEncoder):
-    """
-    Defines RNN encoder that uses covariates and target as input to the RNN.
-
-    Parameters
-    ----------
-    mode
-        type of the RNN. Can be either: rnn_relu (RNN with relu activation),
-        rnn_tanh, (RNN with tanh activation), lstm or gru.
-
-    hidden_size
-        number of units per hidden layer.
-
-    num_layers
-        number of hidden layers.
-
-    bidirectional
-        toggle use of bi-directional RNN as encoder.
-    """
-
-    @validated()
-    def __init__(
-        self,
-        mode: str,
-        hidden_size: int,
-        num_layers: int,
-        bidirectional: bool,
-        **kwargs,
-    ) -> None:
-
-        assert num_layers > 0, "`num_layers` value must be greater than zero"
-        assert hidden_size > 0, "`hidden_size` value must be greater than zero"
-
-        super().__init__(**kwargs)
-
-        with self.name_scope():
-            self.rnn = RNN(mode, hidden_size, num_layers, bidirectional)
-
-    def hybrid_forward(
-        self,
-        F,
-        target: Tensor,
-        static_features: Tensor,
-        dynamic_features: Tensor,
-    ) -> Tuple[Tensor, Tensor]:
-        """
-        Parameters
-        ----------
-        F
-            A module that can either refer to the Symbol API or the NDArray
-            API in MXNet.
-
-        target
-            target time series,
-            shape (batch_size, sequence_length, 1)
-
-        static_features
-            static features,
-            shape (batch_size, num_static_features)
-
-        dynamic_features
-            dynamic_features,
-            shape (batch_size, sequence_length, num_dynamic_features)
-
-        Returns
-        -------
-        Tensor
-            static code,
-            shape (batch_size, num_static_features)
-
-        Tensor
-            dynamic code,
-            shape (batch_size, sequence_length, num_dynamic_features)
-        """
-        inputs = self._assemble_inputs(
-            F, target, static_features, dynamic_features
-        )
-        dynamic_code = self.rnn(inputs)
-
-        # using the last state as the static code,
-        # but not working as well as the concat of all the previous states
-        static_code = F.squeeze(
-            F.slice_axis(dynamic_code, axis=1, begin=-1, end=None), axis=1
-        )
-
-        return static_code, dynamic_code
diff --git a/src/gluonts/dataset/field_names.py b/src/gluonts/dataset/field_names.py
index b7419de9dd..342c397a17 100644
--- a/src/gluonts/dataset/field_names.py
+++ b/src/gluonts/dataset/field_names.py
@@ -28,6 +28,8 @@ class FieldName:
     FEAT_DYNAMIC_CAT = "feat_dynamic_cat"
     FEAT_DYNAMIC_REAL = "feat_dynamic_real"
 
+    # TODO: maybe add FEAT_DYNAMIC = "feat_dynamic"
+
     FEAT_TIME = "time_feat"
     FEAT_CONST = "feat_dynamic_const"
     FEAT_AGE = "feat_dynamic_age"
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 2b086a1acd..9cad2d139f 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -27,6 +27,7 @@
 from gluonts.model.forecast_generator import QuantileForecastGenerator
 from gluonts.support.util import copy_parameters
 from gluonts.trainer import Trainer
+from gluonts.time_feature import time_features_from_frequency_str
 from gluonts.transform import (
     AddAgeFeature,
     AddTimeFeatures,
@@ -35,15 +36,15 @@
     Transformation,
     VstackFeatures,
     RenameFields,
-    SetField,
+    AddConstFeature,
     RemoveFields,
 )
 
 # Relative imports
-from gluonts.time_feature import time_features_from_frequency_str
 from ._forking_network import (
-    ForkingSeq2SeqNetwork,
     ForkingSeq2SeqNetworkBase,
+    ForkingSeq2SeqTrainingNetwork,
+    ForkingSeq2SeqPredictionNetwork,
 )
 from ._transform import ForkingSequenceSplitter
 
@@ -97,7 +98,7 @@ def __init__(
         quantile_output: QuantileOutput,
         freq: str,
         prediction_length: int,
-        use_dynamic_feat: bool = False,
+        use_feat_dynamic_real: bool = False,
         add_time_feature: bool = False,
         add_age_feature: bool = False,
         context_length: Optional[int] = None,
@@ -124,17 +125,21 @@ def __init__(
         self.encoder = encoder
         self.decoder = decoder
         self.quantile_output = quantile_output
-        self.prediction_length = prediction_length
         self.freq = freq
-        self.use_dynamic_feat = use_dynamic_feat
+        self.prediction_length = prediction_length
         self.context_length = (
-            context_length if context_length is not None else prediction_length
+            context_length
+            if context_length is not None
+            else self.prediction_length
         )
+        self.use_feat_dynamic_real = use_feat_dynamic_real
         self.add_time_feature = add_time_feature
         self.add_age_feature = add_age_feature
+        self.use_dynamic_feat = (
+            use_feat_dynamic_real or add_age_feature or add_time_feature
+        )
 
         # self.use_feat_static_cat = use_feat_static_cat
-        # self.use_feat_dynamic_real = use_feat_dynamic_real
         # self.cardinality = (
         #     cardinality if cardinality and use_feat_static_cat else [1]
         # )
@@ -147,17 +152,14 @@ def __init__(
         # TODO: refactor this variable name: dynamic_network, in fact it
         #  is not even necessary as is, because this is how use_dynamic_feat was
         #  set in MQCNNEstimator and otherwise its not used, i.e. False
-        # is target only network or not?
-        self.use_dynamic_real = (
-            use_dynamic_feat or add_time_feature or add_age_feature
-        )
-
-        print(f"use_dynamic_network: {self.use_dynamic_real}")
+        # # is target only network or not?
+        # self.use_dynamic_real = (
+        #     use_dynamic_feat or add_time_feature or add_age_feature or True  # TODO: fix this
+        # )
+        #
+        # print(f"use_dynamic_network: {self.use_dynamic_real}")
 
     def create_transformation(self) -> Transformation:
-        # remove_field_names = [FieldName.FEAT_DYNAMIC_CAT]
-        # if not self.use_feat_static_real:
-        #     remove_field_names.append(FieldName.FEAT_STATIC_REAL)
         # if not self.use_feat_dynamic_real:
         #     remove_field_names.append(FieldName.FEAT_DYNAMIC_REAL)
 
@@ -186,9 +188,31 @@ def create_transformation(self) -> Transformation:
             )
             dynamic_feat_fields.append(FieldName.FEAT_AGE)
 
-        if self.use_dynamic_feat:
+        # TODO: there may have been a bug here
+        if self.use_feat_dynamic_real:
+            print("NO IM HERE")
             dynamic_feat_fields.append(FieldName.FEAT_DYNAMIC_REAL)
+        else:
+            print("IM HERE")
+            chain.append(
+                RemoveFields(field_names=[FieldName.FEAT_DYNAMIC_REAL])
+            )
+
+        # we need to make sure that there is always some dynamic input
+        # we will however disregard it in the hybrid forward
+        if len(dynamic_feat_fields) == 0:
+            chain.append(
+                AddConstFeature(
+                    target_field=FieldName.TARGET,
+                    output_field=FieldName.FEAT_CONST,
+                    pred_length=self.prediction_length,
+                ),
+            )
+            dynamic_feat_fields.append(FieldName.FEAT_CONST)
 
+        # now we map all the dynamic input onto FieldName.FEAT_DYNAMIC_REAL
+        # TODO: change the field from FieldName.FEAT_DYNAMIC_REAL to FieldName.FEAT_TIME for consistency with deepAR
+        #  or to FieldName.FEAT_DYNAMIC, which would have to be added
         if len(dynamic_feat_fields) > 1:
             chain.append(
                 VstackFeatures(
@@ -200,35 +224,17 @@ def create_transformation(self) -> Transformation:
             len(dynamic_feat_fields) == 1
             and FieldName.FEAT_DYNAMIC_REAL not in dynamic_feat_fields
         ):
+            print("ONLY HAVE DYNAMIC REAL")
             chain.append(
                 RenameFields(
                     {dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC_REAL}
                 )
             )
-
-        # TODO: current problem: cannot have no input, if some input provided, because the decoder will not
-        #  accept input, however, the batches contain input, and python complains that
-        #  it cannot map something to nothing?
-
-        # if dynamic_feat_fields:
-        #     chain.append(
-        #         VstackFeatures(
-        #             output_field=FieldName.FEAT_DYNAMIC_REAL,
-        #             input_fields=dynamic_feat_fields,
-        #         )
-        #     )
-        # else:
-        #     # Unfortunately we always need to pass something.
-        #     # Passing a constant does not have an effect on performance and essentially acts as a bias term.
-        #     SetField(
-        #         output_field=FieldName.FEAT_DYNAMIC_REAL, value=[0.0]
-        #     )
-        #     dynamic_feat_fields.append(FieldName.FEAT_DYNAMIC_REAL)
-
-        # So far the decoder only uses dynamic real
-        decoder_field = (
-            [FieldName.FEAT_DYNAMIC_REAL] if dynamic_feat_fields else []
-        )
+        else:
+            print(
+                "IM NAUGHTY?: ",
+                FieldName.FEAT_DYNAMIC_REAL in dynamic_feat_fields,
+            )
 
         chain.append(
             # because of how the forking decoder works, every time step
@@ -237,20 +243,22 @@ def create_transformation(self) -> Transformation:
                 train_sampler=TestSplitSampler(),
                 enc_len=self.context_length,
                 dec_len=self.prediction_length,
-                encoder_series_fields=decoder_field,
+                encoder_series_fields=[
+                    FieldName.FEAT_DYNAMIC_REAL
+                ],  # TODO: later add categorical too
             ),
         )
 
         return Chain(chain)
 
     def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
-        return ForkingSeq2SeqNetwork(
+        return ForkingSeq2SeqTrainingNetwork(
             encoder=self.encoder,
             enc2dec=PassThroughEnc2Dec(),
             decoder=self.decoder,
             quantile_output=self.quantile_output,
-            use_dynamic_real=self.use_dynamic_real,
-        ).get_training_network()
+            use_dynamic_feat=self.use_dynamic_feat,
+        )
 
     def create_predictor(
         self,
@@ -263,13 +271,15 @@ def create_predictor(
             for quantile in self.quantile_output.quantiles
         ]
 
-        prediction_network = ForkingSeq2SeqNetwork(
+        print("TOTALLY FINE THUS FAR P1")
+
+        prediction_network = ForkingSeq2SeqPredictionNetwork(
             encoder=trained_network.encoder,
             enc2dec=trained_network.enc2dec,
             decoder=trained_network.decoder,
             quantile_output=trained_network.quantile_output,
-            use_dynamic_real=self.use_dynamic_real,
-        ).get_prediction_network()
+            use_dynamic_feat=trained_network.use_dynamic_feat,
+        )
 
         copy_parameters(trained_network, prediction_network)
 
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index c01ef4c3cf..f8eb188c7d 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -50,6 +50,8 @@ def __init__(
         enc2dec: Seq2SeqEnc2Dec,
         decoder: Seq2SeqDecoder,
         quantile_output: QuantileOutput,
+        use_dynamic_feat: bool,
+        # use_static_feat: bool,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -58,70 +60,14 @@ def __init__(
         self.enc2dec = enc2dec
         self.decoder = decoder
         self.quantile_output = quantile_output
+        self.use_dynamic_feat = use_dynamic_feat
+        # self.use_static_feat = use_static_feat
 
         with self.name_scope():
             self.quantile_proj = quantile_output.get_quantile_proj()
             self.loss = quantile_output.get_loss()
 
 
-# TODO: THIS SHOULD NOT EXIST, the if else logic should be handled in
-#  the _forking_estimator.py, and possible assertions too
-class ForkingSeq2SeqNetwork:
-    @validated()
-    def __init__(
-        self,
-        encoder: Seq2SeqEncoder,
-        enc2dec: Seq2SeqEnc2Dec,
-        decoder: Seq2SeqDecoder,
-        quantile_output: QuantileOutput,
-        use_dynamic_real: bool = False,
-        use_static_cat: bool = False,
-        **kwargs,
-    ) -> None:
-        self.encoder = encoder
-        self.enc2dec = enc2dec
-        self.decoder = decoder
-        self.quantile_output = quantile_output
-
-        self.use_dynamic_real = use_dynamic_real
-        self.use_static_cat = use_static_cat
-
-        # TODO: add this feature:
-        assert self.use_static_cat is False
-
-    def get_training_network(self) -> ForkingSeq2SeqNetworkBase:
-        if self.use_dynamic_real is False:
-            return ForkingSeq2SeqTargetTrainingNetwork(
-                encoder=self.encoder,
-                enc2dec=self.enc2dec,
-                decoder=self.decoder,
-                quantile_output=self.quantile_output,
-            )
-        else:
-            return ForkingSeq2SeqTrainingNetwork(
-                encoder=self.encoder,
-                enc2dec=self.enc2dec,
-                decoder=self.decoder,
-                quantile_output=self.quantile_output,
-            )
-
-    def get_prediction_network(self) -> ForkingSeq2SeqNetworkBase:
-        if self.use_dynamic_real is False:
-            return ForkingSeq2SeqTargetPredictionNetwork(
-                encoder=self.encoder,
-                enc2dec=self.enc2dec,
-                decoder=self.decoder,
-                quantile_output=self.quantile_output,
-            )
-        else:
-            return ForkingSeq2SeqPredictionNetwork(
-                encoder=self.encoder,
-                enc2dec=self.enc2dec,
-                decoder=self.decoder,
-                quantile_output=self.quantile_output,
-            )
-
-
 # TODO: figure out whether we need 2 classes each, in fact we would need 4 each,
 #  if adding categorical with this technique, does not seem reasonable
 class ForkingSeq2SeqTrainingNetwork(ForkingSeq2SeqNetworkBase):
@@ -148,10 +94,12 @@ def hybrid_forward(
         loss with shape (FIXME, FIXME)
         """
 
-        # FIXME: can we factor out a common prefix in the base network?
+        print("TOTALLY FINE SO FAR")
+
         feat_static_real = F.zeros(shape=(1,))
-        # TODO: THIS IS OVERWRITING THE ARGUMENT?!?! (REMOVING IT makes add time and age feature work):
-        # past_feat_dynamic_real = F.zeros(shape=(1,))
+        # TODO: Required to be commented out for shape inference...
+        # if not self.use_dynamic_feat:
+        #     past_feat_dynamic_real = F.zeros(shape=(1,))
         future_feat_dynamic_real = F.zeros(shape=(1,))
 
         # arguments: target, static_features, dynamic_features
@@ -159,6 +107,8 @@ def hybrid_forward(
             past_target, feat_static_real, past_feat_dynamic_real
         )
 
+        print("TOTALLY FINE SO FAR 2")
+
         # arguments: encoder_output_static, encoder_output_dynamic, future_features
         # TODO: figure out how future_features is supposed to be used: since no distinction
         #  between dynamic and static anymore (shape is (N, T, C) suggesting dynamic feature)
@@ -169,7 +119,11 @@ def hybrid_forward(
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
         dec_dist_output = self.quantile_proj(dec_output)
 
+        print("TOTALLY FINE SO FAR 3")
+
         loss = self.loss(future_target, dec_dist_output)
+
+        print("TOTALLY FINE SO FAR 4")
         return loss.mean(axis=1)
 
 
@@ -191,105 +145,31 @@ def hybrid_forward(
         prediction tensor with shape (FIXME, FIXME)
         """
 
-        # FIXME: can we factor out a common prefix in the base network?
+        print("TOTALLY FINE SO FAR 5")
 
         feat_static_real = F.zeros(shape=(1,))
+        # TODO: Required to be commented out for shape inference...
+        # if not self.use_dynamic_feat:
+        #     past_feat_dynamic_real = F.zeros(shape=(1,))
         future_feat_dynamic_real = F.zeros(shape=(1,))
 
-        enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
-        )
-
-        enc_output_static = (
-            F.zeros(shape=(1,))
-            if enc_output_static is None
-            else enc_output_static
-        )
-
-        dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real,
-        )
-
-        dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
-        fcst_output = F.slice_axis(dec_output, axis=1, begin=-1, end=None)
-        fcst_output = F.squeeze(fcst_output, axis=1)
-
-        predictions = self.quantile_proj(fcst_output).swapaxes(2, 1)
-        return predictions
-
-
-class ForkingSeq2SeqTargetTrainingNetwork(ForkingSeq2SeqNetworkBase):
-    # noinspection PyMethodOverriding
-    def hybrid_forward(
-        self, F, past_target: Tensor, future_target: Tensor
-    ) -> Tensor:
-        """
-        Parameters
-        ----------
-        F: mx.symbol or mx.ndarray
-            Gluon function space
-        past_target: Tensor
-            FIXME
-        future_target: Tensor
-            shape (num_ts, encoder_length, 1) FIXME
-
-        Returns
-        -------
-        loss with shape (FIXME, FIXME)
-        """
-
-        feat_static_real = F.zeros(shape=(1,))
-        past_feat_dynamic_real = F.zeros(shape=(1,))
-        future_feat_dynamic_real = F.zeros(shape=(1,))
-
-        enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
-        )
-
-        dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
-        )
-
-        dec_output = self.decoder(dec_input_dynamic, dec_input_static)
-        dec_dist_output = self.quantile_proj(dec_output)
-
-        loss = self.loss(future_target, dec_dist_output)
-        return loss.mean(axis=1)
-
-
-class ForkingSeq2SeqTargetPredictionNetwork(ForkingSeq2SeqNetworkBase):
-    # noinspection PyMethodOverriding
-    def hybrid_forward(self, F, past_target: Tensor) -> Tensor:
-        """
-        Parameters
-        ----------
-        F: mx.symbol or mx.ndarray
-            Gluon function space
-        past_target: Tensor
-            FIXME
-
-        Returns
-        -------
-        prediction tensor with shape (FIXME, FIXME)
-        """
-
-        # FIXME: can we factor out a common prefix in the base network?
-        feat_static_real = F.zeros(shape=(1,))
-        past_feat_dynamic_real = F.zeros(shape=(1,))
-        future_feat_dynamic_real = F.zeros(shape=(1,))
+        print("TOTALLY FINE SO FAR 6")
 
         enc_output_static, enc_output_dynamic = self.encoder(
             past_target, feat_static_real, past_feat_dynamic_real
         )
 
+        # TODO: figure out WHY IS THIS NEEDED HERE?
         enc_output_static = (
             F.zeros(shape=(1,))
             if enc_output_static is None
             else enc_output_static
         )
 
+        print("TOTALLY FINE SO FAR 7")
+
         dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
+            enc_output_static, enc_output_dynamic, future_feat_dynamic_real,
         )
 
         dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 96cd710791..34f8136352 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -81,17 +81,13 @@ def __init__(
             [d > 0 for d in decoder_mlp_dim_seq]
         ), "Elements of `mlp_hidden_dimension_seq` should be > 0"
 
-        use_dynamic_feat = (
-            use_feat_dynamic_real or add_age_feature or add_time_feature
-        )
-
         encoder = HierarchicalCausalConv1DEncoder(
             dilation_seq=dilation_seq,
             kernel_size_seq=kernel_size_seq,
             channels_seq=channels_seq,
             use_residual=use_residual,
-            use_dynamic_feat=use_dynamic_feat,
-            # use_static_feat=use_feat_static_cat,
+            use_static_feat=False,
+            use_dynamic_feat=True,
             prefix="encoder_",
         )
 
@@ -111,7 +107,7 @@ def __init__(
             freq=freq,
             prediction_length=prediction_length,
             context_length=context_length,
-            use_dynamic_feat=use_dynamic_feat,
+            use_feat_dynamic_real=use_feat_dynamic_real,
             add_time_feature=add_time_feature,
             add_age_feature=add_age_feature,
             trainer=trainer,
@@ -159,6 +155,8 @@ def __init__(
             num_layers=1,
             bidirectional=True,
             prefix="encoder_",
+            use_static_feat=False,
+            use_dynamic_feat=True,
         )
 
         decoder = ForkingMLPDecoder(
diff --git a/src/gluonts/model/seq2seq/_seq2seq_estimator.py b/src/gluonts/model/seq2seq/_seq2seq_estimator.py
index 2b3211f161..c50ae96bb4 100644
--- a/src/gluonts/model/seq2seq/_seq2seq_estimator.py
+++ b/src/gluonts/model/seq2seq/_seq2seq_estimator.py
@@ -23,9 +23,9 @@
 from gluonts.block.enc2dec import PassThroughEnc2Dec
 from gluonts.block.encoder import (
     HierarchicalCausalConv1DEncoder,
-    RNNCovariateEncoder,
     MLPEncoder,
     Seq2SeqEncoder,
+    RNNEncoder,
 )
 from gluonts.block.feature import FeatureEmbedder
 from gluonts.block.quantile_output import QuantileOutput
@@ -238,11 +238,13 @@ def __init__(
         trainer: Trainer = Trainer(),
         num_parallel_samples: int = 100,
     ) -> None:
-        encoder = RNNCovariateEncoder(
+        encoder = RNNEncoder(
             mode=encoder_rnn_model,
             hidden_size=encoder_rnn_num_hidden,
             num_layers=encoder_rnn_layer,
             bidirectional=encoder_rnn_bidirectional,
+            use_static_feat=True,
+            use_dynamic_feat=True,
         )
         super(RNN2QRForecaster, self).__init__(
             freq=freq,
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index 1943e1d0c4..bab1e878d4 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -66,9 +66,11 @@ def __init__(
         self.forecast_start_out = forecast_start_output_field
         self.decoder_series_fields = decoder_series_fields
 
+    # TODO: make use of these
     def _past(self, col_name):
         return f"past_{col_name}"
 
+    # TODO: make use of these
     def _future(self, col_name):
         return f"future_{col_name}"
 
@@ -140,11 +142,13 @@ def flatmap_transform(
 
                     out[self._future(ts_field)] = forking_dec_field
 
+            # So far pad indicator not in use
             pad_indicator = np.zeros(self.enc_len)
             pad_length = max(0, self.enc_len - sampling_idx)
             pad_indicator[:pad_length] = True
             out[f"past_{self.is_pad_out}"] = pad_indicator
 
+            # So far pad forecast_start_out not in use
             out[self.forecast_start_out] = shift_timestamp(
                 out[self.start_in], sampling_idx
             )
diff --git a/src/gluonts/transform/field.py b/src/gluonts/transform/field.py
index 860204c00b..2e39320c22 100644
--- a/src/gluonts/transform/field.py
+++ b/src/gluonts/transform/field.py
@@ -39,11 +39,11 @@ def __init__(self, mapping: Dict[str, str]) -> None:
 
     def transform(self, data: DataEntry):
         for key, new_key in self.mapping.items():
-            if key not in data:
-                continue
-            assert new_key not in data
-            data[new_key] = data[key]
-            del data[key]
+            if key in data:
+                # no implicit overriding
+                assert new_key not in data
+                data[new_key] = data[key]
+                del data[key]
         return data
 
 
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index 4502cf5969..385020c41f 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -69,6 +69,7 @@ def test_mqcnn_covariate_smoke_test(
         "use_feat_dynamic_real": use_feat_dynamic_real,
         "add_time_feature": add_time_feature,
         "add_age_feature": add_age_feature,
+        "hybridize": True,
     }
 
     dataset_train, dataset_test = make_dummy_datasets_with_features(

From ae683df9b2b885c3800a9c32f64d35ac5dde5fad Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Fri, 17 Apr 2020 23:27:42 +0200
Subject: [PATCH 19/44] Removed print

---
 src/gluonts/model/seq2seq/_forking_estimator.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 9cad2d139f..5feb9fd190 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -230,11 +230,6 @@ def create_transformation(self) -> Transformation:
                     {dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC_REAL}
                 )
             )
-        else:
-            print(
-                "IM NAUGHTY?: ",
-                FieldName.FEAT_DYNAMIC_REAL in dynamic_feat_fields,
-            )
 
         chain.append(
             # because of how the forking decoder works, every time step

From bd733d84e1d1f428472843d57fd204da107b7e08 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Sat, 18 Apr 2020 00:15:30 +0200
Subject: [PATCH 20/44] Ensuring backward compatibility, some refactoring.

---
 src/gluonts/block/encoder.py                  | 65 +++++++--------
 .../model/seq2seq/_forking_estimator.py       | 19 -----
 src/gluonts/model/seq2seq/_forking_network.py | 81 +++++++------------
 src/gluonts/model/seq2seq/_transform.py       |  4 +-
 4 files changed, 57 insertions(+), 112 deletions(-)

diff --git a/src/gluonts/block/encoder.py b/src/gluonts/block/encoder.py
index f3f7f23ce2..a1ed3dd484 100644
--- a/src/gluonts/block/encoder.py
+++ b/src/gluonts/block/encoder.py
@@ -43,30 +43,24 @@ def hybrid_forward(
         """
         Parameters
         ----------
-
         F:
             A module that can either refer to the Symbol API or the NDArray
             API in MXNet.
-
         target
             target time series,
             shape (batch_size, sequence_length)
-
         static_features
             static features,
             shape (batch_size, num_static_features)
-
         dynamic_features
             dynamic_features,
             shape (batch_size, sequence_length, num_dynamic_features)
 
-
         Returns
         -------
         Tensor
             static code,
             shape (batch_size, num_static_features)
-
         Tensor
             dynamic code,
             shape (batch_size, sequence_length, num_dynamic_features)
@@ -89,15 +83,12 @@ def _assemble_inputs(
         F
             A module that can either refer to the Symbol API or the NDArray
             API in MXNet.
-
         target
             target time series,
             shape (batch_size, sequence_length, 1)
-
         static_features
             static features,
             shape (batch_size, num_static_features)
-
         dynamic_features
             dynamic_features,
             shape (batch_size, sequence_length, num_dynamic_features)
@@ -108,7 +99,6 @@ def _assemble_inputs(
             combined features,
             shape (batch_size, sequence_length,
                    num_static_features + num_dynamic_features + 1)
-
         """
 
         helper_ones = F.ones_like(target)  # Ones of (N, T, 1)
@@ -133,18 +123,16 @@ class HierarchicalCausalConv1DEncoder(Seq2SeqEncoder):
     ----------
     dilation_seq
         dilation for each convolution in the stack.
-
     kernel_size_seq
         kernel size for each convolution in the stack.
-
     channels_seq
         number of channels for each convolution in the stack.
-
     use_residual
         flag to toggle using residual connections.
-
-    use_covariates
-        flag to toggle whether to use coveriates as input to the encoder
+    use_static_feat
+        flag to toggle whether to use use_static_feat as input to the encoder
+    use_dynamic_feat
+        flag to toggle whether to use use_static_feat as input to the encoder
     """
 
     @validated()
@@ -196,29 +184,23 @@ def hybrid_forward(
         """
         Parameters
         ----------
-
         F
             A module that can either refer to the Symbol API or the NDArray
             API in MXNet.
-
         target
             target time series,
             shape (batch_size, sequence_length, 1)
-
         static_features
             static features,
             shape (batch_size, num_static_features)
-
         dynamic_features
             dynamic_features,
             shape (batch_size, sequence_length, num_dynamic_features)
-
         Returns
         -------
         Tensor
             static code,
             shape (batch_size, num_static_features)
-
         Tensor
             dynamic code,
             shape (batch_size, sequence_length, num_dynamic_features)
@@ -236,8 +218,6 @@ def hybrid_forward(
         else:
             inputs = target
 
-        print("Been here done that.")
-
         # NTC -> NCT (or NCW)
         ct = inputs.swapaxes(1, 2)
         ct = self.cnn(ct)
@@ -251,8 +231,6 @@ def hybrid_forward(
         static_code = F.slice_axis(ct, axis=1, begin=-1, end=None)
         static_code = F.squeeze(static_code, axis=1)
 
-        print("Been here done that. 2.")
-
         return static_code, ct
 
 
@@ -265,15 +243,16 @@ class RNNEncoder(Seq2SeqEncoder):
     mode
         type of the RNN. Can be either: rnn_relu (RNN with relu activation),
         rnn_tanh, (RNN with tanh activation), lstm or gru.
-
     hidden_size
         number of units per hidden layer.
-
     num_layers
         number of hidden layers.
-
     bidirectional
         toggle use of bi-directional RNN as encoder.
+    use_static_feat
+        flag to toggle whether to use use_static_feat as input to the encoder
+    use_dynamic_feat
+        flag to toggle whether to use use_static_feat as input to the encoder
     """
 
     @validated()
@@ -291,6 +270,7 @@ def __init__(
         assert hidden_size > 0, "`hidden_size` value must be greater than zero"
 
         super().__init__(**kwargs)
+
         self.mode = mode
         self.hidden_size = hidden_size
         self.num_layers = num_layers
@@ -314,15 +294,12 @@ def hybrid_forward(
         F
             A module that can either refer to the Symbol API or the NDArray
             API in MXNet.
-
         target
             target time series,
             shape (batch_size, sequence_length, 1)
-
         static_features
             static features,
             shape (batch_size, num_static_features)
-
         dynamic_features
             dynamic_features,
             shape (batch_size, sequence_length, num_dynamic_features)
@@ -332,7 +309,6 @@ def hybrid_forward(
         Tensor
             static code,
             shape (batch_size, num_static_features)
-
         Tensor
             dynamic code,
             shape (batch_size, sequence_length, num_dynamic_features)
@@ -383,15 +359,12 @@ def hybrid_forward(
         F
             A module that can either refer to the Symbol API or the NDArray
             API in MXNet.
-
         target
             target time series,
             shape (batch_size, sequence_length)
-
         static_features
             static features,
             shape (batch_size, num_static_features)
-
         dynamic_features
             dynamic_features,
             shape (batch_size, sequence_length, num_dynamic_features)
@@ -401,7 +374,6 @@ def hybrid_forward(
         Tensor
             static code,
             shape (batch_size, num_static_features)
-
         Tensor
             dynamic code,
             shape (batch_size, sequence_length, num_dynamic_features)
@@ -413,3 +385,22 @@ def hybrid_forward(
         static_code = self.model(inputs)
         dynamic_code = F.zeros_like(target).expand_dims(2)
         return static_code, dynamic_code
+
+
+class RNNCovariateEncoder(RNNEncoder):
+    """
+    Deprecated class only for compatibility; use RNNEncoder instead.
+    """
+
+    @validated()
+    def __init__(
+        self,
+        use_static_feat: bool = True,
+        use_dynamic_feat: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            use_static_feat=use_static_feat,
+            use_dynamic_feat=use_dynamic_feat,
+            **kwargs,
+        )
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 5feb9fd190..11625e9d37 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -149,20 +149,7 @@ def __init__(
         #     else [min(50, (cat + 1) // 2) for cat in self.cardinality]
         # )
 
-        # TODO: refactor this variable name: dynamic_network, in fact it
-        #  is not even necessary as is, because this is how use_dynamic_feat was
-        #  set in MQCNNEstimator and otherwise its not used, i.e. False
-        # # is target only network or not?
-        # self.use_dynamic_real = (
-        #     use_dynamic_feat or add_time_feature or add_age_feature or True  # TODO: fix this
-        # )
-        #
-        # print(f"use_dynamic_network: {self.use_dynamic_real}")
-
     def create_transformation(self) -> Transformation:
-        # if not self.use_feat_dynamic_real:
-        #     remove_field_names.append(FieldName.FEAT_DYNAMIC_REAL)
-
         chain = []
         dynamic_feat_fields = []
 
@@ -188,12 +175,9 @@ def create_transformation(self) -> Transformation:
             )
             dynamic_feat_fields.append(FieldName.FEAT_AGE)
 
-        # TODO: there may have been a bug here
         if self.use_feat_dynamic_real:
-            print("NO IM HERE")
             dynamic_feat_fields.append(FieldName.FEAT_DYNAMIC_REAL)
         else:
-            print("IM HERE")
             chain.append(
                 RemoveFields(field_names=[FieldName.FEAT_DYNAMIC_REAL])
             )
@@ -224,7 +208,6 @@ def create_transformation(self) -> Transformation:
             len(dynamic_feat_fields) == 1
             and FieldName.FEAT_DYNAMIC_REAL not in dynamic_feat_fields
         ):
-            print("ONLY HAVE DYNAMIC REAL")
             chain.append(
                 RenameFields(
                     {dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC_REAL}
@@ -266,8 +249,6 @@ def create_predictor(
             for quantile in self.quantile_output.quantiles
         ]
 
-        print("TOTALLY FINE THUS FAR P1")
-
         prediction_network = ForkingSeq2SeqPredictionNetwork(
             encoder=trained_network.encoder,
             enc2dec=trained_network.enc2dec,
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index f8eb188c7d..95d1cba6df 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -67,6 +67,28 @@ def __init__(
             self.quantile_proj = quantile_output.get_quantile_proj()
             self.loss = quantile_output.get_loss()
 
+    # this method connects the sub-networks and returns the decoder output
+    def get_decoder_network_output(
+        self, F, past_target: Tensor, past_feat_dynamic_real: Tensor
+    ) -> Tensor:
+        feat_static_real = F.zeros(shape=(1,))
+        future_feat_dynamic_real = F.zeros(shape=(1,))
+
+        # arguments: target, static_features, dynamic_features
+        enc_output_static, enc_output_dynamic = self.encoder(
+            past_target, feat_static_real, past_feat_dynamic_real
+        )
+
+        # arguments: encoder_output_static, encoder_output_dynamic, future_features
+        dec_input_static, dec_input_dynamic, _ = self.enc2dec(
+            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
+        )
+
+        # arguments: dynamic_input, static_input
+        dec_output = self.decoder(dec_input_dynamic, dec_input_static)
+
+        return dec_output
+
 
 # TODO: figure out whether we need 2 classes each, in fact we would need 4 each,
 #  if adding categorical with this technique, does not seem reasonable
@@ -93,37 +115,13 @@ def hybrid_forward(
         -------
         loss with shape (FIXME, FIXME)
         """
-
-        print("TOTALLY FINE SO FAR")
-
-        feat_static_real = F.zeros(shape=(1,))
-        # TODO: Required to be commented out for shape inference...
-        # if not self.use_dynamic_feat:
-        #     past_feat_dynamic_real = F.zeros(shape=(1,))
-        future_feat_dynamic_real = F.zeros(shape=(1,))
-
-        # arguments: target, static_features, dynamic_features
-        enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
-        )
-
-        print("TOTALLY FINE SO FAR 2")
-
-        # arguments: encoder_output_static, encoder_output_dynamic, future_features
-        # TODO: figure out how future_features is supposed to be used: since no distinction
-        #  between dynamic and static anymore (shape is (N, T, C) suggesting dynamic feature)
-        dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
+        dec_output = self.get_decoder_network_output(
+            F, past_target, past_feat_dynamic_real
         )
 
-        dec_output = self.decoder(dec_input_dynamic, dec_input_static)
         dec_dist_output = self.quantile_proj(dec_output)
-
-        print("TOTALLY FINE SO FAR 3")
-
         loss = self.loss(future_target, dec_dist_output)
 
-        print("TOTALLY FINE SO FAR 4")
         return loss.mean(axis=1)
 
 
@@ -144,37 +142,12 @@ def hybrid_forward(
         -------
         prediction tensor with shape (FIXME, FIXME)
         """
-
-        print("TOTALLY FINE SO FAR 5")
-
-        feat_static_real = F.zeros(shape=(1,))
-        # TODO: Required to be commented out for shape inference...
-        # if not self.use_dynamic_feat:
-        #     past_feat_dynamic_real = F.zeros(shape=(1,))
-        future_feat_dynamic_real = F.zeros(shape=(1,))
-
-        print("TOTALLY FINE SO FAR 6")
-
-        enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
-        )
-
-        # TODO: figure out WHY IS THIS NEEDED HERE?
-        enc_output_static = (
-            F.zeros(shape=(1,))
-            if enc_output_static is None
-            else enc_output_static
-        )
-
-        print("TOTALLY FINE SO FAR 7")
-
-        dec_inp_static, dec_inp_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real,
+        dec_output = self.get_decoder_network_output(
+            F, past_target, past_feat_dynamic_real
         )
 
-        dec_output = self.decoder(dec_inp_dynamic, dec_inp_static)
         fcst_output = F.slice_axis(dec_output, axis=1, begin=-1, end=None)
         fcst_output = F.squeeze(fcst_output, axis=1)
-
         predictions = self.quantile_proj(fcst_output).swapaxes(2, 1)
+
         return predictions
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index bab1e878d4..aede54faa5 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -121,7 +121,7 @@ def flatmap_transform(
                 # if we have less than enc_len values, pad_left with 0
                 past_piece = pad_to_size(slice, self.enc_len)
 
-                out[f"past_{ts_field}"] = past_piece.transpose()
+                out[self._past(ts_field)] = past_piece.transpose()
 
                 # in prediction mode, don't provide decode-values
                 if not is_train and ts_field == self.target_in:
@@ -146,7 +146,7 @@ def flatmap_transform(
             pad_indicator = np.zeros(self.enc_len)
             pad_length = max(0, self.enc_len - sampling_idx)
             pad_indicator[:pad_length] = True
-            out[f"past_{self.is_pad_out}"] = pad_indicator
+            out[self._past(self.is_pad_out)] = pad_indicator
 
             # So far pad forecast_start_out not in use
             out[self.forecast_start_out] = shift_timestamp(

From b17c66309ffebc1b7a4b2180e8ea6561203173e7 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Sat, 18 Apr 2020 00:55:55 +0200
Subject: [PATCH 21/44] Mainly argument refactoring, but also some legibility
 refactoring.

---
 src/gluonts/block/encoder.py                  |   4 +-
 src/gluonts/dataset/field_names.py            |   2 +-
 .../model/seq2seq/_forking_estimator.py       |  28 +++--
 src/gluonts/model/seq2seq/_forking_network.py |  16 +--
 .../model/seq2seq/_mq_dnn_estimator.py        | 110 ++++++++++++------
 5 files changed, 94 insertions(+), 66 deletions(-)

diff --git a/src/gluonts/block/encoder.py b/src/gluonts/block/encoder.py
index a1ed3dd484..76e1cd7fa9 100644
--- a/src/gluonts/block/encoder.py
+++ b/src/gluonts/block/encoder.py
@@ -132,7 +132,7 @@ class HierarchicalCausalConv1DEncoder(Seq2SeqEncoder):
     use_static_feat
         flag to toggle whether to use use_static_feat as input to the encoder
     use_dynamic_feat
-        flag to toggle whether to use use_static_feat as input to the encoder
+        flag to toggle whether to use use_dynamic_feat as input to the encoder
     """
 
     @validated()
@@ -252,7 +252,7 @@ class RNNEncoder(Seq2SeqEncoder):
     use_static_feat
         flag to toggle whether to use use_static_feat as input to the encoder
     use_dynamic_feat
-        flag to toggle whether to use use_static_feat as input to the encoder
+        flag to toggle whether to use use_dynamic_feat as input to the encoder
     """
 
     @validated()
diff --git a/src/gluonts/dataset/field_names.py b/src/gluonts/dataset/field_names.py
index 342c397a17..0e0a6ff7f8 100644
--- a/src/gluonts/dataset/field_names.py
+++ b/src/gluonts/dataset/field_names.py
@@ -28,7 +28,7 @@ class FieldName:
     FEAT_DYNAMIC_CAT = "feat_dynamic_cat"
     FEAT_DYNAMIC_REAL = "feat_dynamic_real"
 
-    # TODO: maybe add FEAT_DYNAMIC = "feat_dynamic"
+    FEAT_DYNAMIC = "feat_dynamic"
 
     FEAT_TIME = "time_feat"
     FEAT_CONST = "feat_dynamic_const"
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 11625e9d37..3fb9a422e3 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -84,6 +84,13 @@ class ForkingSeq2SeqEstimator(GluonEstimator):
         frequency of the time series
     prediction_length
         length of the decoding sequence
+    use_feat_dynamic_real
+        Whether to use the ``feat_dynamic_real`` field from the data (default: False)
+    add_time_feature
+        Adds a set of time features.
+    add_age_feature
+        Adds an age feature.
+        The age feature starts with a small value at the start of the time series and grows over time.
     context_length
         length of the encoding sequence (prediction_length is used if None)
     trainer
@@ -98,10 +105,10 @@ def __init__(
         quantile_output: QuantileOutput,
         freq: str,
         prediction_length: int,
+        context_length: Optional[int] = None,
         use_feat_dynamic_real: bool = False,
         add_time_feature: bool = False,
         add_age_feature: bool = False,
-        context_length: Optional[int] = None,
         trainer: Trainer = Trainer(),
     ) -> None:
         super().__init__(trainer=trainer)
@@ -194,24 +201,17 @@ def create_transformation(self) -> Transformation:
             )
             dynamic_feat_fields.append(FieldName.FEAT_CONST)
 
-        # now we map all the dynamic input onto FieldName.FEAT_DYNAMIC_REAL
-        # TODO: change the field from FieldName.FEAT_DYNAMIC_REAL to FieldName.FEAT_TIME for consistency with deepAR
-        #  or to FieldName.FEAT_DYNAMIC, which would have to be added
+        # now we map all the dynamic input onto FieldName.FEAT_DYNAMIC
         if len(dynamic_feat_fields) > 1:
             chain.append(
                 VstackFeatures(
-                    output_field=FieldName.FEAT_DYNAMIC_REAL,
+                    output_field=FieldName.FEAT_DYNAMIC,
                     input_fields=dynamic_feat_fields,
                 )
             )
-        elif (
-            len(dynamic_feat_fields) == 1
-            and FieldName.FEAT_DYNAMIC_REAL not in dynamic_feat_fields
-        ):
+        elif len(dynamic_feat_fields) == 1:
             chain.append(
-                RenameFields(
-                    {dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC_REAL}
-                )
+                RenameFields({dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC})
             )
 
         chain.append(
@@ -222,7 +222,7 @@ def create_transformation(self) -> Transformation:
                 enc_len=self.context_length,
                 dec_len=self.prediction_length,
                 encoder_series_fields=[
-                    FieldName.FEAT_DYNAMIC_REAL
+                    FieldName.FEAT_DYNAMIC
                 ],  # TODO: later add categorical too
             ),
         )
@@ -235,7 +235,6 @@ def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
             enc2dec=PassThroughEnc2Dec(),
             decoder=self.decoder,
             quantile_output=self.quantile_output,
-            use_dynamic_feat=self.use_dynamic_feat,
         )
 
     def create_predictor(
@@ -254,7 +253,6 @@ def create_predictor(
             enc2dec=trained_network.enc2dec,
             decoder=trained_network.decoder,
             quantile_output=trained_network.quantile_output,
-            use_dynamic_feat=trained_network.use_dynamic_feat,
         )
 
         copy_parameters(trained_network, prediction_network)
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 95d1cba6df..2d9f9f5721 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -50,8 +50,6 @@ def __init__(
         enc2dec: Seq2SeqEnc2Dec,
         decoder: Seq2SeqDecoder,
         quantile_output: QuantileOutput,
-        use_dynamic_feat: bool,
-        # use_static_feat: bool,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -60,8 +58,6 @@ def __init__(
         self.enc2dec = enc2dec
         self.decoder = decoder
         self.quantile_output = quantile_output
-        self.use_dynamic_feat = use_dynamic_feat
-        # self.use_static_feat = use_static_feat
 
         with self.name_scope():
             self.quantile_proj = quantile_output.get_quantile_proj()
@@ -69,14 +65,14 @@ def __init__(
 
     # this method connects the sub-networks and returns the decoder output
     def get_decoder_network_output(
-        self, F, past_target: Tensor, past_feat_dynamic_real: Tensor
+        self, F, past_target: Tensor, past_feat_dynamic: Tensor
     ) -> Tensor:
         feat_static_real = F.zeros(shape=(1,))
         future_feat_dynamic_real = F.zeros(shape=(1,))
 
         # arguments: target, static_features, dynamic_features
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_real
+            past_target, feat_static_real, past_feat_dynamic
         )
 
         # arguments: encoder_output_static, encoder_output_dynamic, future_features
@@ -98,7 +94,7 @@ def hybrid_forward(
         self,
         F,
         past_target: Tensor,
-        past_feat_dynamic_real: Tensor,
+        past_feat_dynamic: Tensor,
         future_target: Tensor,
     ) -> Tensor:
         """
@@ -116,7 +112,7 @@ def hybrid_forward(
         loss with shape (FIXME, FIXME)
         """
         dec_output = self.get_decoder_network_output(
-            F, past_target, past_feat_dynamic_real
+            F, past_target, past_feat_dynamic
         )
 
         dec_dist_output = self.quantile_proj(dec_output)
@@ -128,7 +124,7 @@ def hybrid_forward(
 class ForkingSeq2SeqPredictionNetwork(ForkingSeq2SeqNetworkBase):
     # noinspection PyMethodOverriding
     def hybrid_forward(
-        self, F, past_target: Tensor, past_feat_dynamic_real: Tensor
+        self, F, past_target: Tensor, past_feat_dynamic: Tensor
     ) -> Tensor:
         """
         Parameters
@@ -143,7 +139,7 @@ def hybrid_forward(
         prediction tensor with shape (FIXME, FIXME)
         """
         dec_output = self.get_decoder_network_output(
-            F, past_target, past_feat_dynamic_real
+            F, past_target, past_feat_dynamic
         )
 
         fcst_output = F.slice_axis(dec_output, axis=1, begin=-1, end=None)
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 34f8136352..2857fed8e7 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -28,12 +28,6 @@
 from gluonts.model.seq2seq._forking_estimator import ForkingSeq2SeqEstimator
 
 
-# TODO: in general, it seems unnecessary to put the MQCNN and MQRNN into Seq2Seq since their commonality in code with
-#  the rest is just the abstract classes Seq2SeqDecoder and Se2SeqEncoder,
-#  and the Estimator is not based on Seq2SeqEstimator!
-
-
-# TODO: integrate MQDNN, change arguments to non mutable
 class MQCNNEstimator(ForkingSeq2SeqEstimator):
     """
     An :class:`MQDNNEstimator` with a Convolutional Neural Network (CNN) as an
@@ -53,38 +47,69 @@ def __init__(
         add_time_feature: bool = False,
         add_age_feature: bool = False,
         seed: Optional[int] = None,
-        decoder_mlp_dim_seq: List[int] = [20],
-        channels_seq: List[int] = [30, 30, 30],
-        dilation_seq: List[int] = [1, 3, 9],
-        kernel_size_seq: List[int] = [3, 3, 3],
+        decoder_mlp_dim_seq: Optional[List[int]] = None,
+        channels_seq: Optional[List[int]] = None,
+        dilation_seq: Optional[List[int]] = None,
+        kernel_size_seq: Optional[List[int]] = None,
         use_residual: bool = True,
-        quantiles: List[float] = list(
-            [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
-        ),
+        quantiles: Optional[List[float]] = None,
         trainer: Trainer = Trainer(),
     ) -> None:
 
-        if seed:
-            np.random.seed(seed)
-            mx.random.seed(seed)
-
-        assert (
-            len(channels_seq) == len(dilation_seq) == len(kernel_size_seq)
-        ), (
-            f"mismatch CNN configurations: {len(channels_seq)} vs. "
-            f"{len(dilation_seq)} vs. {len(kernel_size_seq)}"
-        )
         assert (
             prediction_length > 0
         ), f"Invalid prediction length: {prediction_length}."
-        assert all(
-            [d > 0 for d in decoder_mlp_dim_seq]
+        assert decoder_mlp_dim_seq is None or all(
+            d > 0 for d in decoder_mlp_dim_seq
         ), "Elements of `mlp_hidden_dimension_seq` should be > 0"
+        assert channels_seq is None or all(
+            [d > 0 for d in channels_seq]
+        ), "Elements of `channels_seq` should be > 0"
+        assert dilation_seq is None or all(
+            [d > 0 for d in dilation_seq]
+        ), "Elements of `dilation_seq` should be > 0"
+        assert kernel_size_seq is None or all(
+            [d > 0 for d in kernel_size_seq]
+        ), "Elements of `kernel_size_seq` should be > 0"
+        assert quantiles is None or all(
+            [0 <= d <= 1 for d in quantiles]
+        ), "Elements of `quantiles` should be >= 0 and <= 1"
+
+        self.decoder_mlp_dim_seq = (
+            decoder_mlp_dim_seq if decoder_mlp_dim_seq is not None else [20]
+        )
+        self.channels_seq = (
+            channels_seq if channels_seq is not None else [30, 30, 30]
+        )
+        self.dilation_seq = (
+            dilation_seq if dilation_seq is not None else [1, 3, 9]
+        )
+        self.kernel_size_seq = (
+            kernel_size_seq if kernel_size_seq is not None else [3, 3, 3]
+        )
+        self.quantiles = (
+            quantiles
+            if quantiles is not None
+            else [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+        )
+
+        assert (
+            len(self.channels_seq)
+            == len(self.dilation_seq)
+            == len(self.kernel_size_seq)
+        ), (
+            f"mismatch CNN configurations: {len(self.channels_seq)} vs. "
+            f"{len(self.dilation_seq)} vs. {len(self.kernel_size_seq)}"
+        )
+
+        if seed:
+            np.random.seed(seed)
+            mx.random.seed(seed)
 
         encoder = HierarchicalCausalConv1DEncoder(
-            dilation_seq=dilation_seq,
-            kernel_size_seq=kernel_size_seq,
-            channels_seq=channels_seq,
+            dilation_seq=self.dilation_seq,
+            kernel_size_seq=self.kernel_size_seq,
+            channels_seq=self.channels_seq,
             use_residual=use_residual,
             use_static_feat=False,
             use_dynamic_feat=True,
@@ -93,12 +118,12 @@ def __init__(
 
         decoder = ForkingMLPDecoder(
             dec_len=prediction_length,
-            final_dim=decoder_mlp_dim_seq[-1],
-            hidden_dimension_sequence=decoder_mlp_dim_seq[:-1],
+            final_dim=self.decoder_mlp_dim_seq[-1],
+            hidden_dimension_sequence=self.decoder_mlp_dim_seq[:-1],
             prefix="decoder_",
         )
 
-        quantile_output = QuantileOutput(quantiles)
+        quantile_output = QuantileOutput(self.quantiles)
 
         super().__init__(
             encoder=encoder,
@@ -124,7 +149,6 @@ def derive_auto_fields(cls, train_iter):
         }
 
 
-# TODO: integrate MQDNN, change arguments to non mutable
 class MQRNNEstimator(ForkingSeq2SeqEstimator):
     """
     An :class:`MQDNNEstimator` with a Recurrent Neural Network (RNN) as an
@@ -137,17 +161,27 @@ def __init__(
         prediction_length: int,
         freq: str,
         context_length: Optional[int] = None,
-        decoder_mlp_dim_seq: List[int] = [20],
+        decoder_mlp_dim_seq: List[int] = None,
         trainer: Trainer = Trainer(),
-        quantiles: List[float] = list([0.1, 0.5, 0.9]),
+        quantiles: List[float] = None,
     ) -> None:
 
         assert (
             prediction_length > 0
         ), f"Invalid prediction length: {prediction_length}."
-        assert all(
+        assert decoder_mlp_dim_seq is None or all(
             [d > 0 for d in decoder_mlp_dim_seq]
         ), "Elements of `mlp_hidden_dimension_seq` should be > 0"
+        assert quantiles is None or all(
+            [0 <= d <= 1 for d in quantiles]
+        ), "Elements of `quantiles` should be >= 0 and <= 1"
+
+        self.decoder_mlp_dim_seq = (
+            decoder_mlp_dim_seq if decoder_mlp_dim_seq is not None else [20]
+        )
+        self.quantiles = (
+            quantiles if quantiles is not None else [0.1, 0.5, 0.9]
+        )
 
         encoder = RNNEncoder(
             mode="gru",
@@ -161,12 +195,12 @@ def __init__(
 
         decoder = ForkingMLPDecoder(
             dec_len=prediction_length,
-            final_dim=decoder_mlp_dim_seq[-1],
-            hidden_dimension_sequence=decoder_mlp_dim_seq[:-1],
+            final_dim=self.decoder_mlp_dim_seq[-1],
+            hidden_dimension_sequence=self.decoder_mlp_dim_seq[:-1],
             prefix="decoder_",
         )
 
-        quantile_output = QuantileOutput(quantiles)
+        quantile_output = QuantileOutput(self.quantiles)
 
         super().__init__(
             encoder=encoder,

From 62395f6578752c5a15e26ae376eddeb8a7969811 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Mon, 20 Apr 2020 20:02:53 +0200
Subject: [PATCH 22/44] Added use_feat_static_cat support and observed_values
 support.

---
 .../model/seq2seq/_forking_estimator.py       | 132 +++++++++++++----
 src/gluonts/model/seq2seq/_forking_network.py | 135 +++++++++++++++---
 .../model/seq2seq/_mq_dnn_estimator.py        |  18 ++-
 src/gluonts/model/seq2seq/_transform.py       |  36 +++--
 test/model/seq2seq/test_model.py              |   9 +-
 5 files changed, 260 insertions(+), 70 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 3fb9a422e3..effadd2e44 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -12,14 +12,17 @@
 # permissions and limitations under the License.
 
 # Standard library imports
-from typing import Optional
+from typing import Optional, List
+
+# Third-party imports
+import numpy as np
 
 # First-party imports
 from gluonts.block.decoder import Seq2SeqDecoder
 from gluonts.block.enc2dec import PassThroughEnc2Dec
 from gluonts.block.encoder import Seq2SeqEncoder
 from gluonts.block.quantile_output import QuantileOutput
-from gluonts.core.component import validated
+from gluonts.core.component import validated, DType
 from gluonts.dataset.field_names import FieldName
 from gluonts.model.estimator import GluonEstimator
 from gluonts.model.forecast import Quantile
@@ -38,6 +41,9 @@
     RenameFields,
     AddConstFeature,
     RemoveFields,
+    AsNumpyArray,
+    AddObservedValuesIndicator,
+    SetField,
 )
 
 # Relative imports
@@ -84,17 +90,29 @@ class ForkingSeq2SeqEstimator(GluonEstimator):
         frequency of the time series
     prediction_length
         length of the decoding sequence
+    context_length
+        length of the encoding sequence (prediction_length is used if None)
     use_feat_dynamic_real
         Whether to use the ``feat_dynamic_real`` field from the data (default: False)
+    use_feat_static_cat:
+        Whether to use the ``feat_static_cat`` field from the data (default: False)
+    cardinality: List[int] = None,
+        Number of values of each categorical feature.
+        This must be set if ``use_feat_static_cat == True`` (default: None)
+    embedding_dimension: List[int] = None,
+        Dimension of the embeddings for categorical features
+        (default: [min(50, (cat+1)//2) for cat in cardinality])
     add_time_feature
         Adds a set of time features.
     add_age_feature
         Adds an age feature.
         The age feature starts with a small value at the start of the time series and grows over time.
-    context_length
-        length of the encoding sequence (prediction_length is used if None)
     trainer
-        trainer
+        trainer (default: Trainer())
+    dummy_value
+        Value to use for replacing missing values (default: 0.0)
+    dtype
+        (default: np.float32)
     """
 
     @validated()
@@ -107,9 +125,14 @@ def __init__(
         prediction_length: int,
         context_length: Optional[int] = None,
         use_feat_dynamic_real: bool = False,
+        use_feat_static_cat: bool = False,
+        cardinality: List[int] = None,
+        embedding_dimension: List[int] = None,
         add_time_feature: bool = False,
         add_age_feature: bool = False,
         trainer: Trainer = Trainer(),
+        dummy_value: float = 0.0,
+        dtype: DType = np.float32,
     ) -> None:
         super().__init__(trainer=trainer)
 
@@ -119,15 +142,15 @@ def __init__(
         assert (
             prediction_length > 0
         ), "The value of `prediction_length` should be > 0"
-        # assert (cardinality and use_feat_static_cat) or (
-        #     not (cardinality or use_feat_static_cat)
-        # ), "You should set `cardinality` if and only if `use_feat_static_cat=True`"
-        # assert cardinality is None or all(
-        #     [c > 0 for c in cardinality]
-        # ), "Elements of `cardinality` should be > 0"
-        # assert embedding_dimension is None or all(
-        #     [e > 0 for e in embedding_dimension]
-        # ), "Elements of `embedding_dimension` should be > 0"
+        assert (cardinality and use_feat_static_cat) or (
+            not (cardinality or use_feat_static_cat)
+        ), "You should set `cardinality` if and only if `use_feat_static_cat=True`"
+        assert cardinality is None or all(
+            [c > 0 for c in cardinality]
+        ), "Elements of `cardinality` should be > 0"
+        assert embedding_dimension is None or all(
+            [e > 0 for e in embedding_dimension]
+        ), "Elements of `embedding_dimension` should be > 0"
 
         self.encoder = encoder
         self.decoder = decoder
@@ -140,25 +163,53 @@ def __init__(
             else self.prediction_length
         )
         self.use_feat_dynamic_real = use_feat_dynamic_real
+        self.use_feat_static_cat = use_feat_static_cat
+        self.cardinality = (
+            cardinality if cardinality and use_feat_static_cat else [1]
+        )
+        self.embedding_dimension = (
+            embedding_dimension
+            if embedding_dimension is not None
+            else [min(50, (cat + 1) // 2) for cat in self.cardinality]
+        )
         self.add_time_feature = add_time_feature
         self.add_age_feature = add_age_feature
         self.use_dynamic_feat = (
             use_feat_dynamic_real or add_age_feature or add_time_feature
         )
 
-        # self.use_feat_static_cat = use_feat_static_cat
-        # self.cardinality = (
-        #     cardinality if cardinality and use_feat_static_cat else [1]
-        # )
-        # self.embedding_dimension = (
-        #     embedding_dimension
-        #     if embedding_dimension is not None
-        #     else [min(50, (cat + 1) // 2) for cat in self.cardinality]
-        # )
+        self.dummy_value = dummy_value
+        self.dtype = dtype
 
     def create_transformation(self) -> Transformation:
         chain = []
         dynamic_feat_fields = []
+        remove_field_names = [FieldName.FEAT_DYNAMIC_CAT]
+
+        # --- GENERAL TRANSFORMATION CHAIN ---
+
+        # determine unused input
+        if not self.use_feat_dynamic_real:
+            remove_field_names.append(FieldName.FEAT_DYNAMIC_REAL)
+        if not self.use_feat_static_cat:
+            remove_field_names.append(FieldName.FEAT_STATIC_CAT)
+
+        chain.extend(
+            [
+                RemoveFields(field_names=remove_field_names),
+                AsNumpyArray(
+                    field=FieldName.TARGET, expected_ndim=1, dtype=self.dtype
+                ),
+                AddObservedValuesIndicator(
+                    target_field=FieldName.TARGET,
+                    output_field=FieldName.OBSERVED_VALUES,
+                    dummy_value=self.dummy_value,
+                    dtype=self.dtype,
+                ),
+            ]
+        )
+
+        # --- TRANSFORMATION CHAIN FOR DYNAMIC FEATURES ---
 
         if self.add_time_feature:
             chain.append(
@@ -178,16 +229,13 @@ def create_transformation(self) -> Transformation:
                     target_field=FieldName.TARGET,
                     output_field=FieldName.FEAT_AGE,
                     pred_length=self.prediction_length,
+                    dtype=self.dtype,
                 ),
             )
             dynamic_feat_fields.append(FieldName.FEAT_AGE)
 
         if self.use_feat_dynamic_real:
             dynamic_feat_fields.append(FieldName.FEAT_DYNAMIC_REAL)
-        else:
-            chain.append(
-                RemoveFields(field_names=[FieldName.FEAT_DYNAMIC_REAL])
-            )
 
         # we need to make sure that there is always some dynamic input
         # we will however disregard it in the hybrid forward
@@ -197,6 +245,7 @@ def create_transformation(self) -> Transformation:
                     target_field=FieldName.TARGET,
                     output_field=FieldName.FEAT_CONST,
                     pred_length=self.prediction_length,
+                    dtype=self.dtype,
                 ),
             )
             dynamic_feat_fields.append(FieldName.FEAT_CONST)
@@ -214,6 +263,22 @@ def create_transformation(self) -> Transformation:
                 RenameFields({dynamic_feat_fields[0]: FieldName.FEAT_DYNAMIC})
             )
 
+        # --- TRANSFORMATION CHAIN FOR STATIC FEATURES ---
+
+        if not self.use_feat_static_cat:
+            chain.append(
+                SetField(output_field=FieldName.FEAT_STATIC_CAT, value=[0.0]),
+            )
+        chain.append(
+            AsNumpyArray(
+                field=FieldName.FEAT_STATIC_CAT,
+                expected_ndim=1,
+                dtype=self.dtype,
+            ),
+        )
+
+        # --- SAMPLE AND CUT THE TIME-SERIES ---
+
         chain.append(
             # because of how the forking decoder works, every time step
             # in context is used for splitting, which is why we use the TestSplitSampler
@@ -222,8 +287,9 @@ def create_transformation(self) -> Transformation:
                 enc_len=self.context_length,
                 dec_len=self.prediction_length,
                 encoder_series_fields=[
-                    FieldName.FEAT_DYNAMIC
-                ],  # TODO: later add categorical too
+                    FieldName.FEAT_DYNAMIC,
+                    FieldName.OBSERVED_VALUES,
+                ],
             ),
         )
 
@@ -235,6 +301,10 @@ def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
             enc2dec=PassThroughEnc2Dec(),
             decoder=self.decoder,
             quantile_output=self.quantile_output,
+            context_length=self.context_length,
+            cardinality=self.cardinality,
+            embedding_dimension=self.embedding_dimension,
+            dtype=self.dtype,
         )
 
     def create_predictor(
@@ -253,6 +323,10 @@ def create_predictor(
             enc2dec=trained_network.enc2dec,
             decoder=trained_network.decoder,
             quantile_output=trained_network.quantile_output,
+            context_length=self.context_length,
+            cardinality=self.cardinality,
+            embedding_dimension=self.embedding_dimension,
+            dtype=self.dtype,
         )
 
         copy_parameters(trained_network, prediction_network)
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 2d9f9f5721..651f894497 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -11,10 +11,13 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
+# Third-party imports
+from typing import List
+
 # Third-party imports
 import mxnet as mx
 from mxnet import gluon
-
+import numpy as np
 
 # First-party imports
 from gluonts.block.decoder import Seq2SeqDecoder
@@ -23,6 +26,10 @@
 from gluonts.block.quantile_output import QuantileOutput
 from gluonts.core.component import validated
 from gluonts.model.common import Tensor
+from gluonts.block.feature import FeatureEmbedder
+from gluonts.block.scaler import MeanScaler, NOPScaler
+from gluonts.core.component import DType
+from gluonts.support.util import weighted_average
 
 
 class ForkingSeq2SeqNetworkBase(gluon.HybridBlock):
@@ -39,6 +46,14 @@ class ForkingSeq2SeqNetworkBase(gluon.HybridBlock):
         decoder block
     quantile_output: QuantileOutput
         quantile output block
+    context_length: int,
+        length of the encoding sequence
+    cardinality: List[int],
+        number of values of each categorical feature.
+    embedding_dimension: List[int],
+        dimension of the embeddings for categorical features
+    dtype
+        (default: np.float32)
     kwargs: dict
         dictionary of Gluon HybridBlock parameters
     """
@@ -50,6 +65,10 @@ def __init__(
         enc2dec: Seq2SeqEnc2Dec,
         decoder: Seq2SeqDecoder,
         quantile_output: QuantileOutput,
+        context_length: int,
+        cardinality: List[int],
+        embedding_dimension: List[int],
+        dtype: DType = np.float32,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -58,26 +77,71 @@ def __init__(
         self.enc2dec = enc2dec
         self.decoder = decoder
         self.quantile_output = quantile_output
+        self.context_length = context_length
+        self.cardinality = cardinality
+        self.embedding_dimension = embedding_dimension
+        self.dtype = dtype
+
+        # TODO: implement scaling
+        scaling = False
+        if scaling:
+            self.scaler = MeanScaler(keepdims=True)
+        else:
+            self.scaler = NOPScaler(keepdims=True)
 
         with self.name_scope():
             self.quantile_proj = quantile_output.get_quantile_proj()
             self.loss = quantile_output.get_loss()
+            self.embedder = FeatureEmbedder(
+                cardinalities=cardinality,
+                embedding_dims=embedding_dimension,
+                dtype=self.dtype,
+            )
 
     # this method connects the sub-networks and returns the decoder output
     def get_decoder_network_output(
-        self, F, past_target: Tensor, past_feat_dynamic: Tensor
+        self,
+        F,
+        past_target: Tensor,
+        past_feat_dynamic: Tensor,
+        feat_static_cat: Tensor,
+        past_observed_values: Tensor,
     ) -> Tensor:
-        feat_static_real = F.zeros(shape=(1,))
-        future_feat_dynamic_real = F.zeros(shape=(1,))
+
+        # scale is computed on the context length last units of the past target
+        # scale shape is (batch_size, 1, *target_shape)
+        _, scale = self.scaler(
+            past_target.slice_axis(
+                axis=1, begin=-self.context_length, end=None
+            ),
+            past_observed_values.slice_axis(
+                axis=1, begin=-self.context_length, end=None
+            ),
+        )
+
+        # (batch_size, num_features)
+        embedded_cat = self.embedder(feat_static_cat)
+
+        # in addition to embedding features, use the log scale as it can help prediction too
+        # (batch_size, num_features + prod(target_shape))
+        feat_static_real = F.concat(
+            embedded_cat, F.log(scale.squeeze(axis=1)), dim=1,
+        )
+
+        # Passing past_observed_values as a feature would allow the network to
+        # make that distinction and possibly ignore the masked values.
+        past_feat_dynamic_extended = F.concat(
+            past_feat_dynamic, past_observed_values, dim=-1
+        )
 
         # arguments: target, static_features, dynamic_features
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic
+            past_target, feat_static_real, past_feat_dynamic_extended
         )
 
         # arguments: encoder_output_static, encoder_output_dynamic, future_features
         dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, future_feat_dynamic_real
+            enc_output_static, enc_output_dynamic, F.zeros(shape=(1,))
         )
 
         # arguments: dynamic_input, static_input
@@ -86,45 +150,67 @@ def get_decoder_network_output(
         return dec_output
 
 
-# TODO: figure out whether we need 2 classes each, in fact we would need 4 each,
-#  if adding categorical with this technique, does not seem reasonable
 class ForkingSeq2SeqTrainingNetwork(ForkingSeq2SeqNetworkBase):
     # noinspection PyMethodOverriding
     def hybrid_forward(
         self,
         F,
+        future_target: Tensor,
         past_target: Tensor,
         past_feat_dynamic: Tensor,
-        future_target: Tensor,
+        feat_static_cat: Tensor,
+        past_observed_values: Tensor,  # used by get_decoder_network_output
+        future_observed_values: Tensor,
     ) -> Tensor:
         """
         Parameters
         ----------
         F: mx.symbol or mx.ndarray
             Gluon function space
-        past_target: Tensor
-            FIXME
         future_target: Tensor
-            shape (num_ts, encoder_length, 1) FIXME
+            shape (batch_size, encoder_length, decoder_length)
+        past_target: Tensor
+            shape (batch_size, encoder_length, 1)
+        feat_static_cat
+            shape (batch_size, num_feature_static_cat)
+        past_feat_dynamic
+            shape (batch_size, encoder_length, num_feature_dynamic)
+        past_observed_values: Tensor
+            shape (batch_size, encoder_length, 1)
+        future_observed_values: Tensor
+            shape (batch_size, encoder_length, decoder_length)
 
         Returns
         -------
-        loss with shape (FIXME, FIXME)
+        loss with shape (batch_size, prediction_length)
         """
         dec_output = self.get_decoder_network_output(
-            F, past_target, past_feat_dynamic
+            F,
+            past_target,
+            past_feat_dynamic,
+            feat_static_cat,
+            past_observed_values,
         )
 
         dec_dist_output = self.quantile_proj(dec_output)
         loss = self.loss(future_target, dec_dist_output)
 
-        return loss.mean(axis=1)
+        weighted_loss = weighted_average(
+            F=F, x=loss, weights=future_observed_values, axis=1
+        )
+
+        return weighted_loss
 
 
 class ForkingSeq2SeqPredictionNetwork(ForkingSeq2SeqNetworkBase):
     # noinspection PyMethodOverriding
     def hybrid_forward(
-        self, F, past_target: Tensor, past_feat_dynamic: Tensor
+        self,
+        F,
+        past_target: Tensor,
+        past_feat_dynamic: Tensor,
+        feat_static_cat: Tensor,
+        past_observed_values: Tensor,
     ) -> Tensor:
         """
         Parameters
@@ -132,14 +218,25 @@ def hybrid_forward(
         F: mx.symbol or mx.ndarray
             Gluon function space
         past_target: Tensor
-            FIXME
+            shape (batch_size, encoder_length, 1)
+        feat_static_cat
+            shape (batch_size, num_feature_static_cat)
+        past_feat_dynamic
+            shape (batch_size, encoder_length, num_feature_dynamic)
+        past_observed_values: Tensor
+            shape (batch_size, encoder_length, 1)
 
         Returns
         -------
-        prediction tensor with shape (FIXME, FIXME)
+        prediction tensor with shape (batch_size, prediction_length)
         """
+
         dec_output = self.get_decoder_network_output(
-            F, past_target, past_feat_dynamic
+            F,
+            past_target,
+            past_feat_dynamic,
+            feat_static_cat,
+            past_observed_values,
         )
 
         fcst_output = F.slice_axis(dec_output, axis=1, begin=-1, end=None)
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 2857fed8e7..1a896988b4 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -68,8 +68,9 @@ def __init__(
         assert dilation_seq is None or all(
             [d > 0 for d in dilation_seq]
         ), "Elements of `dilation_seq` should be > 0"
+        # TODO: add support for kernel size=1
         assert kernel_size_seq is None or all(
-            [d > 0 for d in kernel_size_seq]
+            [d > 1 for d in kernel_size_seq]
         ), "Elements of `kernel_size_seq` should be > 0"
         assert quantiles is None or all(
             [0 <= d <= 1 for d in quantiles]
@@ -106,12 +107,14 @@ def __init__(
             np.random.seed(seed)
             mx.random.seed(seed)
 
+        # `use_static_feat` and `use_dynamic_feat` always True because network
+        # always receives input; either from the input data or constants
         encoder = HierarchicalCausalConv1DEncoder(
             dilation_seq=self.dilation_seq,
             kernel_size_seq=self.kernel_size_seq,
             channels_seq=self.channels_seq,
             use_residual=use_residual,
-            use_static_feat=False,
+            use_static_feat=True,
             use_dynamic_feat=True,
             prefix="encoder_",
         )
@@ -133,6 +136,9 @@ def __init__(
             prediction_length=prediction_length,
             context_length=context_length,
             use_feat_dynamic_real=use_feat_dynamic_real,
+            use_feat_static_cat=use_feat_static_cat,
+            cardinality=cardinality,
+            embedding_dimension=embedding_dimension,
             add_time_feature=add_time_feature,
             add_age_feature=add_age_feature,
             trainer=trainer,
@@ -144,8 +150,8 @@ def derive_auto_fields(cls, train_iter):
 
         return {
             "use_feat_dynamic_real": stats.num_feat_dynamic_real > 0,
-            # "use_feat_static_cat": bool(stats.feat_static_cat),
-            # "cardinality": [len(cats) for cats in stats.feat_static_cat],
+            "use_feat_static_cat": bool(stats.feat_static_cat),
+            "cardinality": [len(cats) for cats in stats.feat_static_cat],
         }
 
 
@@ -183,13 +189,15 @@ def __init__(
             quantiles if quantiles is not None else [0.1, 0.5, 0.9]
         )
 
+        # `use_static_feat` and `use_dynamic_feat` always True because network
+        # always receives input; either from the input data or constants
         encoder = RNNEncoder(
             mode="gru",
             hidden_size=50,
             num_layers=1,
             bidirectional=True,
             prefix="encoder_",
-            use_static_feat=False,
+            use_static_feat=True,
             use_dynamic_feat=True,
         )
 
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index aede54faa5..dbae874c86 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -13,7 +13,7 @@
 
 # Standard library imports
 from collections import Counter
-from typing import Iterator, List, Any
+from typing import Iterator, List, Any, Optional
 
 # Third-party imports
 import numpy as np
@@ -21,6 +21,7 @@
 # First-party imports
 from gluonts.core.component import validated
 from gluonts.dataset.common import DataEntry
+from gluonts.dataset.field_names import FieldName
 from gluonts.transform import FlatMapTransformation, shift_timestamp
 
 
@@ -43,9 +44,10 @@ def __init__(
         train_sampler,
         enc_len: int,
         dec_len: int,
-        target_in: str = "target",
-        encoder_series_fields: List[str] = None,
-        decoder_series_fields: List[str] = [],
+        target_in: str = FieldName.TARGET,
+        observed_in: str = FieldName.OBSERVED_VALUES,
+        encoder_series_fields: Optional[List[str]] = None,
+        decoder_series_fields: Optional[List[str]] = None,
         is_pad_out: str = "is_pad",
         start_input_field: str = "start",
         forecast_start_output_field: str = "forecast_start",
@@ -57,20 +59,21 @@ def __init__(
         self.train_sampler = train_sampler
         self.enc_len = enc_len
         self.dec_len = dec_len
-        self.ts_fields = (
-            encoder_series_fields if encoder_series_fields is not None else []
-        )
         self.target_in = target_in
+        self.observed_in = observed_in
         self.is_pad_out = is_pad_out
         self.start_in = start_input_field
         self.forecast_start_out = forecast_start_output_field
-        self.decoder_series_fields = decoder_series_fields
+        self.ts_fields = (
+            encoder_series_fields if encoder_series_fields is not None else []
+        )
+        self.decoder_series_fields = (
+            decoder_series_fields if decoder_series_fields is not None else []
+        )
 
-    # TODO: make use of these
     def _past(self, col_name):
         return f"past_{col_name}"
 
-    # TODO: make use of these
     def _future(self, col_name):
         return f"future_{col_name}"
 
@@ -93,7 +96,9 @@ def flatmap_transform(
         else:
             sampling_indices = [len(target)]
 
-        decoder_fields = set([self.target_in] + self.decoder_series_fields)
+        decoder_fields = set(
+            [self.target_in, self.observed_in] + self.decoder_series_fields
+        )
 
         ts_fields_counter = Counter(
             self.ts_fields + [self.target_in] + self.decoder_series_fields
@@ -124,11 +129,16 @@ def flatmap_transform(
                 out[self._past(ts_field)] = past_piece.transpose()
 
                 # in prediction mode, don't provide decode-values
-                if not is_train and ts_field == self.target_in:
+                if not is_train and (
+                    ts_field in [self.target_in, self.observed_in]
+                ):
                     continue
 
                 if ts_field in decoder_fields:
-                    d3: Any = () if ts_field == self.target_in else (len(ts),)
+                    d3: Any = () if ts_field in [
+                        self.target_in,
+                        self.observed_in,
+                    ] else (len(ts),)
                     forking_dec_field = np.zeros(
                         shape=(self.enc_len, self.dec_len) + d3
                     )
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index 385020c41f..9da983308c 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -56,8 +56,9 @@ def test_accuracy(
 @pytest.mark.parametrize("use_feat_dynamic_real", [True, False])
 @pytest.mark.parametrize("add_time_feature", [True, False])
 @pytest.mark.parametrize("add_age_feature", [True, False])
+@pytest.mark.parametrize("hybridize", [True, False])
 def test_mqcnn_covariate_smoke_test(
-    use_feat_dynamic_real, add_time_feature, add_age_feature
+    use_feat_dynamic_real, add_time_feature, add_age_feature, hybridize
 ):
     hps = {
         "seed": 42,
@@ -69,12 +70,12 @@ def test_mqcnn_covariate_smoke_test(
         "use_feat_dynamic_real": use_feat_dynamic_real,
         "add_time_feature": add_time_feature,
         "add_age_feature": add_age_feature,
-        "hybridize": True,
+        "hybridize": hybridize,
     }
 
     dataset_train, dataset_test = make_dummy_datasets_with_features(
-        cardinality=[3, 10, 42],
-        num_feat_dynamic_real=3,
+        cardinality=[3, 10],
+        num_feat_dynamic_real=2,
         freq=hps["freq"],
         prediction_length=hps["prediction_length"],
     )

From 8d7b87d2848f9c7b9cb05fd8b20195a253e5e820 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Mon, 20 Apr 2020 21:57:06 +0200
Subject: [PATCH 23/44] Minor refactoring.

---
 .../model/seq2seq/_forking_estimator.py       | 19 +++-----
 src/gluonts/model/seq2seq/_forking_network.py |  1 +
 .../model/seq2seq/_seq2seq_estimator.py       | 19 ++++----
 src/gluonts/model/seq2seq/_transform.py       | 47 ++++++++++---------
 .../seq2seq/test_forking_sequence_splitter.py |  1 -
 test/model/seq2seq/test_model.py              |  2 +-
 6 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index effadd2e44..b4a3f73de0 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -109,8 +109,6 @@ class ForkingSeq2SeqEstimator(GluonEstimator):
         The age feature starts with a small value at the start of the time series and grows over time.
     trainer
         trainer (default: Trainer())
-    dummy_value
-        Value to use for replacing missing values (default: 0.0)
     dtype
         (default: np.float32)
     """
@@ -131,7 +129,6 @@ def __init__(
         add_time_feature: bool = False,
         add_age_feature: bool = False,
         trainer: Trainer = Trainer(),
-        dummy_value: float = 0.0,
         dtype: DType = np.float32,
     ) -> None:
         super().__init__(trainer=trainer)
@@ -177,14 +174,15 @@ def __init__(
         self.use_dynamic_feat = (
             use_feat_dynamic_real or add_age_feature or add_time_feature
         )
-
-        self.dummy_value = dummy_value
         self.dtype = dtype
 
     def create_transformation(self) -> Transformation:
         chain = []
         dynamic_feat_fields = []
-        remove_field_names = [FieldName.FEAT_DYNAMIC_CAT]
+        remove_field_names = [
+            FieldName.FEAT_DYNAMIC_CAT,
+            FieldName.FEAT_STATIC_REAL,
+        ]
 
         # --- GENERAL TRANSFORMATION CHAIN ---
 
@@ -203,7 +201,6 @@ def create_transformation(self) -> Transformation:
                 AddObservedValuesIndicator(
                     target_field=FieldName.TARGET,
                     output_field=FieldName.OBSERVED_VALUES,
-                    dummy_value=self.dummy_value,
                     dtype=self.dtype,
                 ),
             ]
@@ -286,10 +283,8 @@ def create_transformation(self) -> Transformation:
                 train_sampler=TestSplitSampler(),
                 enc_len=self.context_length,
                 dec_len=self.prediction_length,
-                encoder_series_fields=[
-                    FieldName.FEAT_DYNAMIC,
-                    FieldName.OBSERVED_VALUES,
-                ],
+                encoder_series_fields=[FieldName.FEAT_DYNAMIC],
+                shared_series_fields=[FieldName.OBSERVED_VALUES],
             ),
         )
 
@@ -312,7 +307,7 @@ def create_predictor(
         transformation: Transformation,
         trained_network: ForkingSeq2SeqNetworkBase,
     ) -> Predictor:
-        # todo: this is specific to quantile output
+        # this is specific to quantile output
         quantile_strs = [
             Quantile.from_float(quantile).name
             for quantile in self.quantile_output.quantiles
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 651f894497..5e783fa769 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -195,6 +195,7 @@ def hybrid_forward(
         dec_dist_output = self.quantile_proj(dec_output)
         loss = self.loss(future_target, dec_dist_output)
 
+        # mask the loss based on observed indicator
         weighted_loss = weighted_average(
             F=F, x=loss, weights=future_observed_values, axis=1
         )
diff --git a/src/gluonts/model/seq2seq/_seq2seq_estimator.py b/src/gluonts/model/seq2seq/_seq2seq_estimator.py
index c50ae96bb4..cf9756300b 100644
--- a/src/gluonts/model/seq2seq/_seq2seq_estimator.py
+++ b/src/gluonts/model/seq2seq/_seq2seq_estimator.py
@@ -45,7 +45,6 @@
 from ._seq2seq_network import Seq2SeqPredictionNetwork, Seq2SeqTrainingNetwork
 
 
-# TODO: fix mutable arguments
 class Seq2SeqEstimator(GluonEstimator):
     """
     Quantile-Regression Sequence-to-Sequence Estimator
@@ -64,7 +63,7 @@ def __init__(
         decoder_mlp_static_dim: int,
         scaler: Scaler = NOPScaler(),
         context_length: Optional[int] = None,
-        quantiles: List[float] = [0.1, 0.5, 0.9],
+        quantiles: Optional[List[float]] = None,
         trainer: Trainer = Trainer(),
         num_parallel_samples: int = 100,
     ) -> None:
@@ -74,6 +73,9 @@ def __init__(
         assert (
             context_length is None or context_length > 0
         ), "The value of `context_length` should be > 0"
+        assert quantiles is None or all(
+            [0 <= d <= 1 for d in quantiles]
+        ), "Elements of `quantiles` should be >= 0 and <= 1"
 
         super().__init__(trainer=trainer)
 
@@ -82,7 +84,9 @@ def __init__(
         )
         self.prediction_length = prediction_length
         self.freq = freq
-        self.quantiles = quantiles
+        self.quantiles = (
+            quantiles if quantiles is not None else [0.1, 0.5, 0.9]
+        )
         self.encoder = encoder
         self.decoder_mlp_layer = decoder_mlp_layer
         self.decoder_mlp_static_dim = decoder_mlp_static_dim
@@ -196,7 +200,7 @@ def __init__(
         decoder_mlp_static_dim: int,
         scaler: Scaler = NOPScaler(),
         context_length: Optional[int] = None,
-        quantiles: List[float] = list([0.1, 0.5, 0.9]),
+        quantiles: Optional[List[float]] = None,
         trainer: Trainer = Trainer(),
         num_parallel_samples: int = 100,
     ) -> None:
@@ -217,7 +221,6 @@ def __init__(
         )
 
 
-# TODO: fix mutable arguments
 class RNN2QRForecaster(Seq2SeqEstimator):
     @validated()
     def __init__(
@@ -234,7 +237,7 @@ def __init__(
         encoder_rnn_bidirectional: bool = True,
         scaler: Scaler = NOPScaler(),
         context_length: Optional[int] = None,
-        quantiles: List[float] = list([0.1, 0.5, 0.9]),
+        quantiles: Optional[List[float]] = None,
         trainer: Trainer = Trainer(),
         num_parallel_samples: int = 100,
     ) -> None:
@@ -262,7 +265,6 @@ def __init__(
         )
 
 
-# TODO: fix mutable arguments
 class CNN2QRForecaster(Seq2SeqEstimator):
     @validated()
     def __init__(
@@ -275,7 +277,7 @@ def __init__(
         decoder_mlp_static_dim: int,
         scaler: Scaler = NOPScaler(),
         context_length: Optional[int] = None,
-        quantiles: List[float] = list([0.1, 0.5, 0.9]),
+        quantiles: Optional[List[float]] = None,
         trainer: Trainer = Trainer(),
         num_parallel_samples: int = 100,
     ) -> None:
@@ -285,6 +287,7 @@ def __init__(
             channels_seq=[30, 30, 30],
             use_residual=True,
             use_dynamic_feat=True,
+            use_static_feat=True,
         )
 
         super(CNN2QRForecaster, self).__init__(
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index dbae874c86..9629efd873 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -44,13 +44,12 @@ def __init__(
         train_sampler,
         enc_len: int,
         dec_len: int,
-        target_in: str = FieldName.TARGET,
-        observed_in: str = FieldName.OBSERVED_VALUES,
+        target_field=FieldName.TARGET,
         encoder_series_fields: Optional[List[str]] = None,
         decoder_series_fields: Optional[List[str]] = None,
+        shared_series_fields: Optional[List[str]] = None,
         is_pad_out: str = "is_pad",
         start_input_field: str = "start",
-        forecast_start_output_field: str = "forecast_start",
     ) -> None:
 
         assert enc_len > 0, "The value of `enc_len` should be > 0"
@@ -59,17 +58,24 @@ def __init__(
         self.train_sampler = train_sampler
         self.enc_len = enc_len
         self.dec_len = dec_len
-        self.target_in = target_in
-        self.observed_in = observed_in
-        self.is_pad_out = is_pad_out
-        self.start_in = start_input_field
-        self.forecast_start_out = forecast_start_output_field
-        self.ts_fields = (
+        self.target_field = target_field
+
+        self.encoder_series_fields = (
             encoder_series_fields if encoder_series_fields is not None else []
         )
         self.decoder_series_fields = (
             decoder_series_fields if decoder_series_fields is not None else []
         )
+        # defines the fields that are shared among encoder and decoder,
+        # this includes the target by default
+        self.shared_series_fields = (
+            shared_series_fields + [self.target_field]
+            if shared_series_fields is not None
+            else [self.target_field]
+        )
+
+        self.is_pad_out = is_pad_out
+        self.start_in = start_input_field
 
     def _past(self, col_name):
         return f"past_{col_name}"
@@ -80,7 +86,7 @@ def _future(self, col_name):
     def flatmap_transform(
         self, data: DataEntry, is_train: bool
     ) -> Iterator[DataEntry]:
-        target = data[self.target_in]
+        target = data[self.target_field]
 
         if is_train:
             # We currently cannot handle time series that are shorter than the
@@ -97,11 +103,13 @@ def flatmap_transform(
             sampling_indices = [len(target)]
 
         decoder_fields = set(
-            [self.target_in, self.observed_in] + self.decoder_series_fields
+            self.shared_series_fields + self.decoder_series_fields
         )
 
         ts_fields_counter = Counter(
-            self.ts_fields + [self.target_in] + self.decoder_series_fields
+            self.encoder_series_fields
+            + self.shared_series_fields
+            + self.decoder_series_fields
         )
 
         for sampling_idx in sampling_indices:
@@ -129,16 +137,13 @@ def flatmap_transform(
                 out[self._past(ts_field)] = past_piece.transpose()
 
                 # in prediction mode, don't provide decode-values
-                if not is_train and (
-                    ts_field in [self.target_in, self.observed_in]
-                ):
+                if not is_train and (ts_field in self.shared_series_fields):
                     continue
 
                 if ts_field in decoder_fields:
-                    d3: Any = () if ts_field in [
-                        self.target_in,
-                        self.observed_in,
-                    ] else (len(ts),)
+                    d3: Any = () if ts_field in self.shared_series_fields else (
+                        len(ts),
+                    )
                     forking_dec_field = np.zeros(
                         shape=(self.enc_len, self.dec_len) + d3
                     )
@@ -158,8 +163,8 @@ def flatmap_transform(
             pad_indicator[:pad_length] = True
             out[self._past(self.is_pad_out)] = pad_indicator
 
-            # So far pad forecast_start_out not in use
-            out[self.forecast_start_out] = shift_timestamp(
+            # So far pad forecast_start not in use
+            out[FieldName.FORECAST_START] = shift_timestamp(
                 out[self.start_in], sampling_idx
             )
 
diff --git a/test/model/seq2seq/test_forking_sequence_splitter.py b/test/model/seq2seq/test_forking_sequence_splitter.py
index 42d2881bc6..d88338d0ab 100644
--- a/test/model/seq2seq/test_forking_sequence_splitter.py
+++ b/test/model/seq2seq/test_forking_sequence_splitter.py
@@ -124,7 +124,6 @@ def make_dataset(N, train_length):
                 train_sampler=TSplitSampler(),
                 enc_len=5,
                 dec_len=3,
-                target_in=FieldName.TARGET,
                 encoder_series_fields=[
                     FieldName.FEAT_AGE,
                     FieldName.FEAT_TIME,
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index 9da983308c..9cbd5ebfed 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -50,7 +50,7 @@ def test_accuracy(
         num_batches_per_epoch=100, hybridize=hybridize, quantiles=quantiles
     )
 
-    accuracy_test(Estimator, hyperparameters, accuracy=0.25)
+    accuracy_test(Estimator, hyperparameters, accuracy=0.20)
 
 
 @pytest.mark.parametrize("use_feat_dynamic_real", [True, False])

From a3bf607b4fd4c4fe1bc6b0b8a208ec61041fb6eb Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Tue, 21 Apr 2020 18:42:19 +0200
Subject: [PATCH 24/44] Addressing Jaspers Review

---
 src/gluonts/model/estimator.py                  | 13 ++++---------
 src/gluonts/model/seq2seq/_forking_estimator.py |  8 ++++----
 src/gluonts/model/seq2seq/_mq_dnn_estimator.py  | 12 ++++++------
 src/gluonts/model/seq2seq/_seq2seq_estimator.py |  2 +-
 4 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/src/gluonts/model/estimator.py b/src/gluonts/model/estimator.py
index 0668d19ce6..121bc61cb0 100644
--- a/src/gluonts/model/estimator.py
+++ b/src/gluonts/model/estimator.py
@@ -77,12 +77,9 @@ def derive_auto_fields(cls, train_iter):
     def from_inputs(cls, train_iter, **params):
         # auto_params usually include `use_feat_dynamic_real`, `use_feat_static_cat` and `cardinality`
         auto_params = cls.derive_auto_fields(train_iter)
-        # FIXME: probably params should take precedence over auto_params, since they were deliberately set,
-        #   however, on that case this method does not make sense, since if params says `use_feat_dynamic_real`=True
-        #   but `auto_params`=False, then this will lead to an error, since the appropriate data does not exist.
-        #   This the only context in which this method makes sense is when auto_params take precedence, which could
-        #   lead to overwriting of explicit parameters. In this case a warning should be issued.
-        return cls.from_hyperparameters(**auto_params, **params)
+        # user specified 'params' will take precedence:
+        params = {**auto_params, **params}
+        return cls.from_hyperparameters(params)
 
 
 class DummyEstimator(Estimator):
@@ -141,9 +138,7 @@ def from_hyperparameters(cls, **hyperparameters) -> "GluonEstimator":
             )
 
         try:
-            trainer = hyperparameters.get("trainer")
-            if not isinstance(trainer, Trainer):
-                trainer = from_hyperparameters(Trainer, **hyperparameters)
+            trainer = from_hyperparameters(Trainer, **hyperparameters)
 
             return cls(
                 **Model(**{**hyperparameters, "trainer": trainer}).__dict__
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index b4a3f73de0..981fbc9bae 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -139,14 +139,14 @@ def __init__(
         assert (
             prediction_length > 0
         ), "The value of `prediction_length` should be > 0"
-        assert (cardinality and use_feat_static_cat) or (
-            not (cardinality or use_feat_static_cat)
+        assert (
+            use_feat_static_cat or not cardinality
         ), "You should set `cardinality` if and only if `use_feat_static_cat=True`"
         assert cardinality is None or all(
-            [c > 0 for c in cardinality]
+            c > 0 for c in cardinality
         ), "Elements of `cardinality` should be > 0"
         assert embedding_dimension is None or all(
-            [e > 0 for e in embedding_dimension]
+            e > 0 for e in embedding_dimension
         ), "Elements of `embedding_dimension` should be > 0"
 
         self.encoder = encoder
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 1a896988b4..6dcd57766c 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -63,17 +63,17 @@ def __init__(
             d > 0 for d in decoder_mlp_dim_seq
         ), "Elements of `mlp_hidden_dimension_seq` should be > 0"
         assert channels_seq is None or all(
-            [d > 0 for d in channels_seq]
+            d > 0 for d in channels_seq
         ), "Elements of `channels_seq` should be > 0"
         assert dilation_seq is None or all(
-            [d > 0 for d in dilation_seq]
+            d > 0 for d in dilation_seq
         ), "Elements of `dilation_seq` should be > 0"
         # TODO: add support for kernel size=1
         assert kernel_size_seq is None or all(
-            [d > 1 for d in kernel_size_seq]
+            d > 1 for d in kernel_size_seq
         ), "Elements of `kernel_size_seq` should be > 0"
         assert quantiles is None or all(
-            [0 <= d <= 1 for d in quantiles]
+            0 <= d <= 1 for d in quantiles
         ), "Elements of `quantiles` should be >= 0 and <= 1"
 
         self.decoder_mlp_dim_seq = (
@@ -176,10 +176,10 @@ def __init__(
             prediction_length > 0
         ), f"Invalid prediction length: {prediction_length}."
         assert decoder_mlp_dim_seq is None or all(
-            [d > 0 for d in decoder_mlp_dim_seq]
+            d > 0 for d in decoder_mlp_dim_seq
         ), "Elements of `mlp_hidden_dimension_seq` should be > 0"
         assert quantiles is None or all(
-            [0 <= d <= 1 for d in quantiles]
+            0 <= d <= 1 for d in quantiles
         ), "Elements of `quantiles` should be >= 0 and <= 1"
 
         self.decoder_mlp_dim_seq = (
diff --git a/src/gluonts/model/seq2seq/_seq2seq_estimator.py b/src/gluonts/model/seq2seq/_seq2seq_estimator.py
index cf9756300b..14712ec66f 100644
--- a/src/gluonts/model/seq2seq/_seq2seq_estimator.py
+++ b/src/gluonts/model/seq2seq/_seq2seq_estimator.py
@@ -74,7 +74,7 @@ def __init__(
             context_length is None or context_length > 0
         ), "The value of `context_length` should be > 0"
         assert quantiles is None or all(
-            [0 <= d <= 1 for d in quantiles]
+            0 <= d <= 1 for d in quantiles
         ), "Elements of `quantiles` should be >= 0 and <= 1"
 
         super().__init__(trainer=trainer)

From ed594aca0e7784f3c9db0d2194a90fc9facbf09e Mon Sep 17 00:00:00 2001
From: Jasper Schulz <jasper.b.schulz@googlemail.com>
Date: Wed, 22 Apr 2020 16:41:51 +0200
Subject: [PATCH 25/44] Update src/gluonts/model/estimator.py

---
 src/gluonts/model/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gluonts/model/estimator.py b/src/gluonts/model/estimator.py
index 121bc61cb0..2558965e79 100644
--- a/src/gluonts/model/estimator.py
+++ b/src/gluonts/model/estimator.py
@@ -79,7 +79,7 @@ def from_inputs(cls, train_iter, **params):
         auto_params = cls.derive_auto_fields(train_iter)
         # user specified 'params' will take precedence:
         params = {**auto_params, **params}
-        return cls.from_hyperparameters(params)
+        return cls.from_hyperparameters(**params)
 
 
 class DummyEstimator(Estimator):

From edc17fb9dda9bf4016d95affbf143db922d3407f Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Thu, 23 Apr 2020 20:58:09 +0200
Subject: [PATCH 26/44] Backwards compatibility and minor fixes.

---
 src/gluonts/dataset/stat.py                   | 10 ++++--
 src/gluonts/model/estimator.py                |  2 +-
 src/gluonts/model/predictor.py                |  5 ++-
 .../model/seq2seq/_forking_estimator.py       |  4 +++
 src/gluonts/transform/field.py                | 11 ++++++-
 test/model/seq2seq/test_model.py              | 32 +++++++++++++++++++
 6 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/src/gluonts/dataset/stat.py b/src/gluonts/dataset/stat.py
index 191c6f02b9..633757b219 100644
--- a/src/gluonts/dataset/stat.py
+++ b/src/gluonts/dataset/stat.py
@@ -139,7 +139,9 @@ def __eq__(self, other):
 
 # TODO: reorganize modules to avoid circular dependency
 # TODO: and substitute Any with Dataset
-def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
+def calculate_dataset_statistics(
+    ts_dataset: Any, backwards_compatibility=True
+) -> DatasetStatistics:
     """
     Computes the statistics of a given Dataset.
 
@@ -147,6 +149,9 @@ def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
     ----------
     ts_dataset
         Dataset of which to compute the statistics.
+    backwards_compatibility
+        Ensures backwards compatibility regarding the naming of certain Fields.
+        For example, 'dynamic_feat' is also accepted as FieldName.FEAT_DYNAMIC_REAL
 
     Returns
     -------
@@ -300,7 +305,7 @@ def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
             feat_dynamic_real = (
                 ts[FieldName.FEAT_DYNAMIC_REAL]
                 if FieldName.FEAT_DYNAMIC_REAL in ts
-                else None
+                else (ts["dynamic_feat"] if "dynamic_feat" in ts else None)
             )
 
             if feat_dynamic_real is None:
@@ -316,6 +321,7 @@ def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
                 if num_feat_dynamic_real is None:
                     # first num_feat_dynamic_real found
                     num_feat_dynamic_real = feat_dynamic_real.shape[0]
+                    # TODO: could assert that always same feat_dynamic_real key is used
                 else:
                     assert_data_error(
                         num_feat_dynamic_real == feat_dynamic_real.shape[0],
diff --git a/src/gluonts/model/estimator.py b/src/gluonts/model/estimator.py
index 121bc61cb0..2558965e79 100644
--- a/src/gluonts/model/estimator.py
+++ b/src/gluonts/model/estimator.py
@@ -79,7 +79,7 @@ def from_inputs(cls, train_iter, **params):
         auto_params = cls.derive_auto_fields(train_iter)
         # user specified 'params' will take precedence:
         params = {**auto_params, **params}
-        return cls.from_hyperparameters(params)
+        return cls.from_hyperparameters(**params)
 
 
 class DummyEstimator(Estimator):
diff --git a/src/gluonts/model/predictor.py b/src/gluonts/model/predictor.py
index a3031107de..3b474190db 100644
--- a/src/gluonts/model/predictor.py
+++ b/src/gluonts/model/predictor.py
@@ -162,8 +162,11 @@ def derive_auto_fields(cls, train_iter):
 
     @classmethod
     def from_inputs(cls, train_iter, **params):
+        # auto_params usually include `use_feat_dynamic_real`, `use_feat_static_cat` and `cardinality`
         auto_params = cls.derive_auto_fields(train_iter)
-        return cls.from_hyperparameters(**auto_params, **params)
+        # user specified 'params' will take precedence:
+        params = {**auto_params, **params}
+        return cls.from_hyperparameters(**params)
 
 
 class RepresentablePredictor(Predictor):
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 981fbc9bae..114a2f99a3 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -232,6 +232,10 @@ def create_transformation(self) -> Transformation:
             dynamic_feat_fields.append(FieldName.FEAT_AGE)
 
         if self.use_feat_dynamic_real:
+            # Backwards compatibility:
+            chain.append(
+                RenameFields({"dynamic_feat": FieldName.FEAT_DYNAMIC_REAL})
+            )
             dynamic_feat_fields.append(FieldName.FEAT_DYNAMIC_REAL)
 
         # we need to make sure that there is always some dynamic input
diff --git a/src/gluonts/transform/field.py b/src/gluonts/transform/field.py
index 2e39320c22..e126051749 100644
--- a/src/gluonts/transform/field.py
+++ b/src/gluonts/transform/field.py
@@ -22,7 +22,7 @@
 
 class RenameFields(SimpleTransformation):
     """
-    Rename fields using a mapping
+    Rename fields using a mapping, if source field present.
 
     Parameters
     ----------
@@ -48,6 +48,15 @@ def transform(self, data: DataEntry):
 
 
 class RemoveFields(SimpleTransformation):
+    """"
+    Remove field names if present.
+
+    Parameters
+    ----------
+    field_names
+        List of names of the fields that will be removed
+    """
+
     @validated()
     def __init__(self, field_names: List[str]) -> None:
         self.field_names = field_names
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index 9cbd5ebfed..6fcd628945 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -93,3 +93,35 @@ def test_repr(Estimator, repr_test, hyperparameters):
 
 def test_serialize(Estimator, serialize_test, hyperparameters):
     serialize_test(Estimator, hyperparameters)
+
+
+def test_backwards_compatibility():
+    hps = {
+        "freq": "D",
+        "prediction_length": 3,
+        "quantiles": [0.5, 0.1],
+        "epochs": 3,
+        "num_batches_per_epoch": 3,
+        "use_feat_dynamic_real": True,
+    }
+
+    dataset_train, dataset_test = make_dummy_datasets_with_features(
+        cardinality=[3, 10],
+        num_feat_dynamic_real=2,
+        freq=hps["freq"],
+        prediction_length=hps["prediction_length"],
+    )
+
+    for entry in dataset_train:
+        entry["dynamic_feat"] = entry["feat_dynamic_real"]
+        del entry["feat_dynamic_real"]
+
+    for entry in dataset_test:
+        entry["dynamic_feat"] = entry["feat_dynamic_real"]
+        del entry["feat_dynamic_real"]
+
+    estimator = MQCNNEstimator.from_inputs(dataset_train, **hps)
+
+    predictor = estimator.train(dataset_train)
+    forecasts = list(predictor.predict(dataset_test))
+    assert len(forecasts) == len(dataset_test)

From 32271e7e2b439deba477ababa60d424149d83529 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Thu, 30 Apr 2020 17:39:52 +0200
Subject: [PATCH 27/44] Improvements to model throughput.

---
 src/gluonts/dataset/loader.py                   |  5 +++--
 src/gluonts/model/seq2seq/_forking_estimator.py | 15 ++++-----------
 src/gluonts/model/seq2seq/_transform.py         |  4 ++++
 src/gluonts/transform/feature.py                | 10 +++++-----
 src/gluonts/transform/field.py                  |  3 +--
 5 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/src/gluonts/dataset/loader.py b/src/gluonts/dataset/loader.py
index 0b749bdcc5..ea363c43c8 100644
--- a/src/gluonts/dataset/loader.py
+++ b/src/gluonts/dataset/loader.py
@@ -133,7 +133,7 @@ class TrainDataLoader(DataLoader):
         Note that using large prefetching batch will provide smoother bootstrapping performance,
         but will consume more shared_memory. Using smaller number may forfeit the purpose of using
         multiple worker processes, try reduce `num_workers` in this case.
-        By default it defaults to `num_workers * 2`.
+        By default `num_workers * 2`.
     dtype
         Floating point type to use. Default is np.float32.
     shuffle_for_training
@@ -141,6 +141,7 @@ class TrainDataLoader(DataLoader):
     num_batches_for_shuffling
         The effective number of batches among which samples are shuffled. If num_batches_for_shuffling = 8 and
         batch_size = 8 then the next batch will be randomly sampled from about 64 samples.
+        By default 1, since this can have a hit on throughput.
     """
 
     def __init__(
@@ -154,7 +155,7 @@ def __init__(
         num_prefetch: Optional[int] = None,
         dtype: DType = np.float32,
         shuffle_for_training: bool = True,
-        num_batches_for_shuffling: int = 8,
+        num_batches_for_shuffling: int = 1,
         **kwargs
     ) -> None:
         assert dataset, "empty dataset"
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 114a2f99a3..0ae5775934 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -195,9 +195,6 @@ def create_transformation(self) -> Transformation:
         chain.extend(
             [
                 RemoveFields(field_names=remove_field_names),
-                AsNumpyArray(
-                    field=FieldName.TARGET, expected_ndim=1, dtype=self.dtype
-                ),
                 AddObservedValuesIndicator(
                     target_field=FieldName.TARGET,
                     output_field=FieldName.OBSERVED_VALUES,
@@ -268,15 +265,11 @@ def create_transformation(self) -> Transformation:
 
         if not self.use_feat_static_cat:
             chain.append(
-                SetField(output_field=FieldName.FEAT_STATIC_CAT, value=[0.0]),
+                SetField(
+                    output_field=FieldName.FEAT_STATIC_CAT,
+                    value=np.array([0.0]),
+                ),
             )
-        chain.append(
-            AsNumpyArray(
-                field=FieldName.FEAT_STATIC_CAT,
-                expected_ndim=1,
-                dtype=self.dtype,
-            ),
-        )
 
         # --- SAMPLE AND CUT THE TIME-SERIES ---
 
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index 9629efd873..717f99d383 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -116,6 +116,8 @@ def flatmap_transform(
             # ensure start index is not negative
             start_idx = max(0, sampling_idx - self.enc_len)
 
+            # irrelevant data should have been removed by now in the
+            # transformation chain, so copying everything is ok
             out = data.copy()
 
             for ts_field in list(ts_fields_counter.keys()):
@@ -149,6 +151,8 @@ def flatmap_transform(
                     )
 
                     skip = max(0, self.enc_len - sampling_idx)
+                    # This section takes by far the longest time computationally:
+                    # This scales linearly in self.enc_len and linearly in self.dec_len
                     for dec_field, idx in zip(
                         forking_dec_field[skip:],
                         range(start_idx + 1, start_idx + self.enc_len + 1),
diff --git a/src/gluonts/transform/feature.py b/src/gluonts/transform/feature.py
index d382a305a0..44afa8e99a 100644
--- a/src/gluonts/transform/feature.py
+++ b/src/gluonts/transform/feature.py
@@ -68,15 +68,15 @@ def __init__(
 
     def transform(self, data: DataEntry) -> DataEntry:
         value = data[self.target_field]
-        nan_indices = np.where(np.isnan(value))
         nan_entries = np.isnan(value)
 
         if self.convert_nans:
-            value[nan_indices] = self.dummy_value
+            value[np.where(nan_entries)] = self.dummy_value
+            data[self.target_field] = value
 
-        data[self.target_field] = value
-        # Invert bool array so that missing values are zeros and store as float
-        data[self.output_field] = np.invert(nan_entries).astype(self.dtype)
+        data[self.output_field] = np.invert(
+            nan_entries, out=nan_entries
+        ).astype(self.dtype, copy=False)
         return data
 
 
diff --git a/src/gluonts/transform/field.py b/src/gluonts/transform/field.py
index e126051749..d390b65bbf 100644
--- a/src/gluonts/transform/field.py
+++ b/src/gluonts/transform/field.py
@@ -63,8 +63,7 @@ def __init__(self, field_names: List[str]) -> None:
 
     def transform(self, data: DataEntry) -> DataEntry:
         for k in self.field_names:
-            if k in data.keys():
-                del data[k]
+            data.pop(k, None)
         return data
 
 

From 7920a03a869f888693cc3a810ab6971015a61da6 Mon Sep 17 00:00:00 2001
From: Bernie Wang <yuyawang@amazon.com>
Date: Fri, 1 May 2020 23:21:23 -0700
Subject: [PATCH 28/44] allow decoding features

---
 src/gluonts/block/decoder.py                  | 96 +++++++++++++++++++
 .../model/seq2seq/_forking_estimator.py       |  1 +
 2 files changed, 97 insertions(+)

diff --git a/src/gluonts/block/decoder.py b/src/gluonts/block/decoder.py
index deabc87175..5001c6dacb 100644
--- a/src/gluonts/block/decoder.py
+++ b/src/gluonts/block/decoder.py
@@ -138,6 +138,102 @@ def hybrid_forward(
         return mlp_output
 
 
+class ForkingMLPDecoderWithFutureFeat(Seq2SeqDecoder):
+    """
+    Multilayer perceptron decoder for sequence-to-sequence models.
+
+    See [WTN+17]_ for details.
+
+    Parameters
+    ----------
+    dec_len
+        length of the decoder (usually the number of forecasted time steps).
+
+    final_dim
+        dimensionality of the output per time step (number of predicted
+        quantiles).
+
+    hidden_dimension_sequence
+        number of hidden units for each MLP layer.
+    """
+
+    @validated()
+    def __init__(
+        self,
+        dec_len: int,
+        final_dim: int,
+        hidden_dimension_sequence: List[int] = list([]),
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+
+        self.dec_len = dec_len
+        self.final_dims = final_dim
+
+        with self.name_scope():
+            self.model = nn.HybridSequential()
+
+            for layer_no, layer_dim in enumerate(hidden_dimension_sequence):
+                layer = nn.Dense(
+                    dec_len * layer_dim,
+                    flatten=False,
+                    activation="relu",
+                    prefix=f"mlp_{layer_no:#02d}'_",
+                )
+                self.model.add(layer)
+
+            layer = nn.Dense(
+                dec_len * final_dim,
+                flatten=False,
+                activation="softrelu",
+                prefix=f"mlp_{len(hidden_dimension_sequence):#02d}'_",
+            )
+            self.model.add(layer)
+
+    # TODO: add support for static input at some point
+    def hybrid_forward(
+        self,
+        F,
+        dynamic_input: Tensor,
+        dynamic_input_decode: Tensor,
+        static_input: Tensor = None,
+    ) -> Tensor:
+        """
+        ForkingMLPDecoder forward call.
+
+        Parameters
+        ----------
+        F
+            A module that can either refer to the Symbol API or the NDArray
+            API in MXNet.
+
+        dynamic_input
+            dynamic_features, shape (batch_size, encoder_length, num_features)
+            or (N, T, C).
+
+        dynamic_input
+            dynamic_features, shape (batch_size, encoder_length, decoder_length, num_features)
+            or (N, T, T, C).
+
+        static_input
+            not used in this decoder.
+
+        Returns
+        -------
+        Tensor
+            mlp output, shape (batch_size, encoder_length, dec_len, final_dims).
+
+        """
+        mlp_output = self.model(dynamic_input)
+        mlp_output = mlp_output.reshape(
+            shape=(0, 0, self.dec_len, self.final_dims)
+        )
+        mlp_output = F.concat(
+            mlp_output, dynamic_input_decode, dim=-1
+        )  # TODO: would -1 work?
+        return mlp_output
+
+
 class OneShotDecoder(Seq2SeqDecoder):
     """
     OneShotDecoder.
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 0ae5775934..7211fa9904 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -281,6 +281,7 @@ def create_transformation(self) -> Transformation:
                 enc_len=self.context_length,
                 dec_len=self.prediction_length,
                 encoder_series_fields=[FieldName.FEAT_DYNAMIC],
+                # decoder_series_fileds=[FieldName.FEAT_TIME],
                 shared_series_fields=[FieldName.OBSERVED_VALUES],
             ),
         )

From fdb011df7fd395e932d69ec9f2a4782f9c3e5798 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Mon, 4 May 2020 16:55:58 +0200
Subject: [PATCH 29/44] Temporarily added unconditional caching.

---
 src/gluonts/dataset/common.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gluonts/dataset/common.py b/src/gluonts/dataset/common.py
index 7a5fba678d..c9c8544c50 100644
--- a/src/gluonts/dataset/common.py
+++ b/src/gluonts/dataset/common.py
@@ -195,7 +195,8 @@ def __init__(
         path: Path,
         freq: str,
         one_dim_target: bool = True,
-        cache: bool = False,
+        # FIXME: only changed this temporarily
+        cache: bool = True,
     ) -> None:
         self.cache = cache
         self.path = path

From b2800467655d83349a4a9218c37a9ac4e3faf2cf Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Mon, 4 May 2020 18:21:50 +0200
Subject: [PATCH 30/44] Enabled multiprocessing by default.

---
 src/gluonts/dataset/parallelized_loader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gluonts/dataset/parallelized_loader.py b/src/gluonts/dataset/parallelized_loader.py
index 7121204e02..c4fc342512 100644
--- a/src/gluonts/dataset/parallelized_loader.py
+++ b/src/gluonts/dataset/parallelized_loader.py
@@ -576,8 +576,8 @@ def __init__(
 
         self.dtype = dtype
 
-        # TODO: switch to default multiprocessing.cpu_count() here
-        default_num_workers = 0
+        # FIXME: switched permanently on for MQCNN
+        default_num_workers = multiprocessing.cpu_count()
         self.num_workers = (
             num_workers
             if num_workers is not None

From 2ed3c1957d1fde2fc8a59e5616e70636822d196b Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Mon, 4 May 2020 19:59:50 +0200
Subject: [PATCH 31/44] Standardized comments.

---
 src/gluonts/dataset/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gluonts/dataset/common.py b/src/gluonts/dataset/common.py
index c9c8544c50..544e448535 100644
--- a/src/gluonts/dataset/common.py
+++ b/src/gluonts/dataset/common.py
@@ -195,7 +195,7 @@ def __init__(
         path: Path,
         freq: str,
         one_dim_target: bool = True,
-        # FIXME: only changed this temporarily
+        # FIXME: switched permanently on for MQCNN
         cache: bool = True,
     ) -> None:
         self.cache = cache

From 1c57e45d504bb7f9ba18cac40dc529b96a596cdd Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Mon, 4 May 2020 20:40:34 +0200
Subject: [PATCH 32/44] Small bug fixes.

---
 src/gluonts/dataset/parallelized_loader.py | 2 +-
 src/gluonts/model/seq2seq/_transform.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gluonts/dataset/parallelized_loader.py b/src/gluonts/dataset/parallelized_loader.py
index c4fc342512..67ddbae515 100644
--- a/src/gluonts/dataset/parallelized_loader.py
+++ b/src/gluonts/dataset/parallelized_loader.py
@@ -577,7 +577,7 @@ def __init__(
         self.dtype = dtype
 
         # FIXME: switched permanently on for MQCNN
-        default_num_workers = multiprocessing.cpu_count()
+        default_num_workers = int(multiprocessing.cpu_count() * (3 / 5))
         self.num_workers = (
             num_workers
             if num_workers is not None
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index 717f99d383..a25cf10ef7 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -32,7 +32,7 @@ def pad_to_size(xs, size):
         return xs
 
     pad_width = ([(0, 0)] * (xs.ndim - 1)) + [(pad_length, 0)]
-    return np.pad(xs, pad_width)
+    return np.pad(xs, mode="constant", pad_width=pad_width)
 
 
 class ForkingSequenceSplitter(FlatMapTransformation):

From 4fce760dde8e6c8bd5f1da2e92c4c9db010f712d Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Tue, 5 May 2020 20:31:23 +0200
Subject: [PATCH 33/44] making caching and multiprocessing always on a local
 change

---
 src/gluonts/dataset/common.py                 |  3 +-
 src/gluonts/dataset/parallelized_loader.py    |  4 +--
 .../model/seq2seq/_mq_dnn_estimator.py        | 31 +++++++++++++++++++
 .../entry_point_scripts/train_entry_point.py  |  2 +-
 test/dataset/test_variable_length.py          |  6 ++--
 test/model/seq2seq/test_model.py              |  3 ++
 6 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/src/gluonts/dataset/common.py b/src/gluonts/dataset/common.py
index 544e448535..7a5fba678d 100644
--- a/src/gluonts/dataset/common.py
+++ b/src/gluonts/dataset/common.py
@@ -195,8 +195,7 @@ def __init__(
         path: Path,
         freq: str,
         one_dim_target: bool = True,
-        # FIXME: switched permanently on for MQCNN
-        cache: bool = True,
+        cache: bool = False,
     ) -> None:
         self.cache = cache
         self.path = path
diff --git a/src/gluonts/dataset/parallelized_loader.py b/src/gluonts/dataset/parallelized_loader.py
index 67ddbae515..7121204e02 100644
--- a/src/gluonts/dataset/parallelized_loader.py
+++ b/src/gluonts/dataset/parallelized_loader.py
@@ -576,8 +576,8 @@ def __init__(
 
         self.dtype = dtype
 
-        # FIXME: switched permanently on for MQCNN
-        default_num_workers = int(multiprocessing.cpu_count() * (3 / 5))
+        # TODO: switch to default multiprocessing.cpu_count() here
+        default_num_workers = 0
         self.num_workers = (
             num_workers
             if num_workers is not None
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 6dcd57766c..6d25aef9b2 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -12,6 +12,7 @@
 # permissions and limitations under the License.
 
 # Standard library imports
+import multiprocessing
 from typing import List, Optional
 
 # Third-party imports
@@ -19,6 +20,7 @@
 import mxnet as mx
 
 # First-party imports
+from gluonts.dataset.common import Dataset, ListDataset
 from gluonts.dataset.stat import calculate_dataset_statistics
 from gluonts.block.decoder import ForkingMLPDecoder
 from gluonts.block.encoder import HierarchicalCausalConv1DEncoder, RNNEncoder
@@ -154,6 +156,35 @@ def derive_auto_fields(cls, train_iter):
             "cardinality": [len(cats) for cats in stats.feat_static_cat],
         }
 
+    # FIXME: for now we always want the dataset to be cached and utilize multiprocessing.
+    def train(
+        self,
+        training_data: Dataset,
+        validation_data: Optional[Dataset] = None,
+        num_workers: Optional[int] = None,
+        **kwargs,
+    ):
+        cached_train_data = ListDataset(
+            data_iter=list(training_data), freq=self.freq
+        )
+        cached_validation_data = (
+            None
+            if validation_data is None
+            else ListDataset(data_iter=list(validation_data), freq=self.freq)
+        )
+        num_workers = (
+            num_workers
+            if num_workers is not None
+            else int(multiprocessing.cpu_count() * (1 / 2))
+        )
+
+        return super().train(
+            training_data=cached_train_data,
+            validation_data=cached_validation_data,
+            num_workers=num_workers,
+            **kwargs,
+        )
+
 
 class MQRNNEstimator(ForkingSeq2SeqEstimator):
     """
diff --git a/src/gluonts/nursery/sagemaker_sdk/entry_point_scripts/train_entry_point.py b/src/gluonts/nursery/sagemaker_sdk/entry_point_scripts/train_entry_point.py
index 62b2cf535e..5f57b79a89 100644
--- a/src/gluonts/nursery/sagemaker_sdk/entry_point_scripts/train_entry_point.py
+++ b/src/gluonts/nursery/sagemaker_sdk/entry_point_scripts/train_entry_point.py
@@ -73,7 +73,7 @@ def train(arguments):
     evaluator = Evaluator(quantiles=eval(arguments.quantiles))
 
     agg_metrics, item_metrics = evaluator(
-        ts_it, forecast_it, num_series=len(dataset.test)
+        ts_it, forecast_it, num_series=len(list(dataset.test))
     )
 
     # required for metric tracking.
diff --git a/test/dataset/test_variable_length.py b/test/dataset/test_variable_length.py
index 5ac7a8d79a..8734e141a0 100644
--- a/test/dataset/test_variable_length.py
+++ b/test/dataset/test_variable_length.py
@@ -100,9 +100,11 @@ def train_loader(
         kwargs.update(override_args)
 
         if is_train:
-            return TrainDataLoader(num_batches_per_epoch=22, **kwargs)
+            return TrainDataLoader(
+                num_batches_per_epoch=22, num_workers=0, **kwargs
+            )
         else:
-            return InferenceDataLoader(**kwargs)
+            return InferenceDataLoader(num_workers=0, **kwargs)
 
     return train_loader
 
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index 6fcd628945..9deae10319 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -31,6 +31,7 @@ def hyperparameters(dsinfo):
         num_batches_per_epoch=1,
         quantiles=[0.1, 0.5, 0.9],
         use_symbol_block_predictor=True,
+        num_workers=0,
     )
 
 
@@ -71,6 +72,7 @@ def test_mqcnn_covariate_smoke_test(
         "add_time_feature": add_time_feature,
         "add_age_feature": add_age_feature,
         "hybridize": hybridize,
+        "num_workers": 0,
     }
 
     dataset_train, dataset_test = make_dummy_datasets_with_features(
@@ -103,6 +105,7 @@ def test_backwards_compatibility():
         "epochs": 3,
         "num_batches_per_epoch": 3,
         "use_feat_dynamic_real": True,
+        "num_workers": 0,
     }
 
     dataset_train, dataset_test = make_dummy_datasets_with_features(

From 12673bc1bc76e8d86112ae2a3e141b65f379b609 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Tue, 5 May 2020 20:35:02 +0200
Subject: [PATCH 34/44] mend

---
 src/gluonts/model/seq2seq/_mq_dnn_estimator.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 6d25aef9b2..88dfb695bc 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -162,6 +162,7 @@ def train(
         training_data: Dataset,
         validation_data: Optional[Dataset] = None,
         num_workers: Optional[int] = None,
+        num_prefetch: Optional[int] = None,
         **kwargs,
     ):
         cached_train_data = ListDataset(
@@ -182,6 +183,7 @@ def train(
             training_data=cached_train_data,
             validation_data=cached_validation_data,
             num_workers=num_workers,
+            num_prefetch=num_prefetch,
             **kwargs,
         )
 

From a020a4775a554e15a7d9be348b3e73e74e8b2e8c Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Fri, 8 May 2020 12:00:47 +0200
Subject: [PATCH 35/44] Backwards compatibility fix.

---
 src/gluonts/block/decoder.py                  | 37 +++++++++----------
 src/gluonts/dataset/field_names.py            |  1 +
 src/gluonts/dataset/stat.py                   | 34 +++++++----------
 src/gluonts/model/seq2seq/_forking_network.py | 10 ++++-
 src/gluonts/model/seq2seq/_transform.py       |  5 +++
 test/model/deepstate/test_model.py            |  2 +-
 test/model/seq2seq/test_model.py              | 18 +++++----
 7 files changed, 58 insertions(+), 49 deletions(-)

diff --git a/src/gluonts/block/decoder.py b/src/gluonts/block/decoder.py
index 5001c6dacb..6bbc816b2c 100644
--- a/src/gluonts/block/decoder.py
+++ b/src/gluonts/block/decoder.py
@@ -77,7 +77,7 @@ def __init__(
         self,
         dec_len: int,
         final_dim: int,
-        hidden_dimension_sequence: List[int] = list([]),
+        hidden_dimension_sequence: List[int] = [],
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -117,11 +117,10 @@ def hybrid_forward(
         F
             A module that can either refer to the Symbol API or the NDArray
             API in MXNet.
-
         dynamic_input
-            dynamic_features, shape (batch_size, sequence_length, num_features)
-            or (N, T, C).
-
+            dynamic_features, shape (batch_size, sequence_length, num_features) or (N, T, C)
+            where sequence_length is equal to the encoder length, and num_features is equal
+            to channel_seq[-1] for the MQCNN for example.
         static_input
             not used in this decoder.
 
@@ -162,7 +161,7 @@ def __init__(
         self,
         dec_len: int,
         final_dim: int,
-        hidden_dimension_sequence: List[int] = list([]),
+        hidden_dimension_sequence: List[int] = [],
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -194,9 +193,9 @@ def __init__(
     def hybrid_forward(
         self,
         F,
+        dynamic_output_encoder: Tensor,
         dynamic_input: Tensor,
-        dynamic_input_decode: Tensor,
-        static_input: Tensor = None,
+        static_output_encoder: Tensor = None,
     ) -> Tensor:
         """
         ForkingMLPDecoder forward call.
@@ -206,16 +205,14 @@ def hybrid_forward(
         F
             A module that can either refer to the Symbol API or the NDArray
             API in MXNet.
-
+        dynamic_input_encoder
+            dynamic_features, shape (batch_size, sequence_length, num_features) or (N, T, C)
+            where sequence_length is equal to the encoder length, and num_features is equal
+            to channel_seq[-1] for the MQCNN for example.
         dynamic_input
-            dynamic_features, shape (batch_size, encoder_length, num_features)
+            dynamic_features, shape (batch_size, encoder_length, decoder_length, num_features_02)
             or (N, T, C).
-
-        dynamic_input
-            dynamic_features, shape (batch_size, encoder_length, decoder_length, num_features)
-            or (N, T, T, C).
-
-        static_input
+        static_input_encoder
             not used in this decoder.
 
         Returns
@@ -224,13 +221,13 @@ def hybrid_forward(
             mlp output, shape (batch_size, encoder_length, dec_len, final_dims).
 
         """
-        mlp_output = self.model(dynamic_input)
+        mlp_output = self.model(dynamic_output_encoder)
         mlp_output = mlp_output.reshape(
             shape=(0, 0, self.dec_len, self.final_dims)
         )
-        mlp_output = F.concat(
-            mlp_output, dynamic_input_decode, dim=-1
-        )  # TODO: would -1 work?
+        # mlp_output = F.concat(
+        #     mlp_output, dynamic_input, dim=-1
+        # )  # TODO: would -1 work?
         return mlp_output
 
 
diff --git a/src/gluonts/dataset/field_names.py b/src/gluonts/dataset/field_names.py
index 0e0a6ff7f8..d686c4f26f 100644
--- a/src/gluonts/dataset/field_names.py
+++ b/src/gluonts/dataset/field_names.py
@@ -27,6 +27,7 @@ class FieldName:
     FEAT_STATIC_REAL = "feat_static_real"
     FEAT_DYNAMIC_CAT = "feat_dynamic_cat"
     FEAT_DYNAMIC_REAL = "feat_dynamic_real"
+    FEAT_DYNAMIC_REAL_LEGACY = "dynamic_feat"
 
     FEAT_DYNAMIC = "feat_dynamic"
 
diff --git a/src/gluonts/dataset/stat.py b/src/gluonts/dataset/stat.py
index 633757b219..bc94d2604d 100644
--- a/src/gluonts/dataset/stat.py
+++ b/src/gluonts/dataset/stat.py
@@ -139,9 +139,7 @@ def __eq__(self, other):
 
 # TODO: reorganize modules to avoid circular dependency
 # TODO: and substitute Any with Dataset
-def calculate_dataset_statistics(
-    ts_dataset: Any, backwards_compatibility=True
-) -> DatasetStatistics:
+def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
     """
     Computes the statistics of a given Dataset.
 
@@ -149,9 +147,6 @@ def calculate_dataset_statistics(
     ----------
     ts_dataset
         Dataset of which to compute the statistics.
-    backwards_compatibility
-        Ensures backwards compatibility regarding the naming of certain Fields.
-        For example, 'dynamic_feat' is also accepted as FieldName.FEAT_DYNAMIC_REAL
 
     Returns
     -------
@@ -276,14 +271,14 @@ def calculate_dataset_statistics(
             else:
                 if num_feat_dynamic_cat is None:
                     # first num_feat_dynamic_cat found
-                    num_feat_dynamic_cat = feat_dynamic_cat.shape[0]
+                    num_feat_dynamic_cat = len(feat_dynamic_cat)
                 else:
                     assert_data_error(
-                        num_feat_dynamic_cat == feat_dynamic_cat.shape[0],
+                        num_feat_dynamic_cat == len(feat_dynamic_cat),
                         "Found instances with different number of features in "
                         "feat_dynamic_cat, found one with {} and another with {}.",
                         num_feat_dynamic_cat,
-                        feat_dynamic_cat.shape[0],
+                        len(feat_dynamic_cat),
                     )
 
                 assert_data_error(
@@ -291,7 +286,7 @@ def calculate_dataset_statistics(
                     "Features values have to be finite and cannot exceed single "
                     "precision floating point range.",
                 )
-                num_feat_dynamic_cat_time_steps = feat_dynamic_cat.shape[1]
+                num_feat_dynamic_cat_time_steps = len(feat_dynamic_cat[0])
                 assert_data_error(
                     num_feat_dynamic_cat_time_steps == len(target),
                     "Each feature in feat_dynamic_cat has to have the same length as "
@@ -302,11 +297,11 @@ def calculate_dataset_statistics(
                 )
 
             # FEAT_DYNAMIC_REAL
-            feat_dynamic_real = (
-                ts[FieldName.FEAT_DYNAMIC_REAL]
-                if FieldName.FEAT_DYNAMIC_REAL in ts
-                else (ts["dynamic_feat"] if "dynamic_feat" in ts else None)
-            )
+            feat_dynamic_real = None
+            if FieldName.FEAT_DYNAMIC_REAL in ts:
+                feat_dynamic_real = ts[FieldName.FEAT_DYNAMIC_REAL]
+            elif FieldName.FEAT_DYNAMIC_REAL_LEGACY in ts:
+                feat_dynamic_real = ts[FieldName.FEAT_DYNAMIC_REAL_LEGACY]
 
             if feat_dynamic_real is None:
                 # feat_dynamic_real not found, check it was the first ts we encounter or
@@ -320,15 +315,14 @@ def calculate_dataset_statistics(
             else:
                 if num_feat_dynamic_real is None:
                     # first num_feat_dynamic_real found
-                    num_feat_dynamic_real = feat_dynamic_real.shape[0]
-                    # TODO: could assert that always same feat_dynamic_real key is used
+                    num_feat_dynamic_real = len(feat_dynamic_real)
                 else:
                     assert_data_error(
-                        num_feat_dynamic_real == feat_dynamic_real.shape[0],
+                        num_feat_dynamic_real == len(feat_dynamic_real),
                         "Found instances with different number of features in "
                         "feat_dynamic_real, found one with {} and another with {}.",
                         num_feat_dynamic_real,
-                        feat_dynamic_real.shape[0],
+                        len(feat_dynamic_real),
                     )
 
                 assert_data_error(
@@ -336,7 +330,7 @@ def calculate_dataset_statistics(
                     "Features values have to be finite and cannot exceed single "
                     "precision floating point range.",
                 )
-                num_feat_dynamic_real_time_steps = feat_dynamic_real.shape[1]
+                num_feat_dynamic_real_time_steps = len(feat_dynamic_real[0])
                 assert_data_error(
                     num_feat_dynamic_real_time_steps == len(target),
                     "Each feature in feat_dynamic_real has to have the same length as "
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 5e783fa769..5b55af3db7 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -145,8 +145,11 @@ def get_decoder_network_output(
         )
 
         # arguments: dynamic_input, static_input
+        # TODO: optimize what we pass to the decoder for the prediction case,
+        #  where we we only need to pass the encoder output for the last time step
         dec_output = self.decoder(dec_input_dynamic, dec_input_static)
 
+        # the output shape should be: (batch_size, enc_len, dec_len, final_dims)
         return dec_output
 
 
@@ -159,7 +162,7 @@ def hybrid_forward(
         past_target: Tensor,
         past_feat_dynamic: Tensor,
         feat_static_cat: Tensor,
-        past_observed_values: Tensor,  # FOR SOME REASON NOT USED???
+        past_observed_values: Tensor,
         future_observed_values: Tensor,
     ) -> Tensor:
         """
@@ -175,6 +178,9 @@ def hybrid_forward(
             shape (batch_size, encoder_length, num_feature_static_cat)
         past_feat_dynamic
             shape (batch_size, encoder_length, num_feature_dynamic)
+        future_feat_dynamic
+            shape (batch_size, encoder_length, decoder_length, num_feature_dynamic)
+            # or shape (batch_size, decoder_length, num_feature_dynamic) replicated for each of the encoder steps
         past_observed_values: Tensor
             shape (batch_size, encoder_length, 1)
         future_observed_values: Tensor
@@ -240,8 +246,10 @@ def hybrid_forward(
             past_observed_values,
         )
 
+        # We only care about the output of the decoder for the last time step
         fcst_output = F.slice_axis(dec_output, axis=1, begin=-1, end=None)
         fcst_output = F.squeeze(fcst_output, axis=1)
+
         predictions = self.quantile_proj(fcst_output).swapaxes(2, 1)
 
         return predictions
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index a25cf10ef7..bc4ecf4a7e 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -142,6 +142,11 @@ def flatmap_transform(
                 if not is_train and (ts_field in self.shared_series_fields):
                     continue
 
+                # TODO: do the same to the future dynamic feat as we do to the target
+
+                # This is were some of the forking magic happens:
+                # For each of the encoder_len time-steps at which the decoder is applied we slice the
+                # corresponding inputs called decoder_fields to the appropriate dec_len
                 if ts_field in decoder_fields:
                     d3: Any = () if ts_field in self.shared_series_fields else (
                         len(ts),
diff --git a/test/model/deepstate/test_model.py b/test/model/deepstate/test_model.py
index 40bf602146..2a6216e1b4 100644
--- a/test/model/deepstate/test_model.py
+++ b/test/model/deepstate/test_model.py
@@ -35,7 +35,7 @@ def hyperparameters(dsinfo):
 
 
 def test_accuracy(accuracy_test, hyperparameters):
-    hyperparameters.update(num_batches_per_epoch=100)
+    hyperparameters.update(num_batches_per_epoch=200)
 
     accuracy_test(DeepStateEstimator, hyperparameters, accuracy=0.5)
 
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index 9deae10319..df54f1dbb8 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -115,13 +115,17 @@ def test_backwards_compatibility():
         prediction_length=hps["prediction_length"],
     )
 
-    for entry in dataset_train:
-        entry["dynamic_feat"] = entry["feat_dynamic_real"]
-        del entry["feat_dynamic_real"]
-
-    for entry in dataset_test:
-        entry["dynamic_feat"] = entry["feat_dynamic_real"]
-        del entry["feat_dynamic_real"]
+    for i in range(len(dataset_train)):
+        dataset_train.list_data[i]["dynamic_feat"] = dataset_train.list_data[
+            i
+        ]["feat_dynamic_real"]
+        del dataset_train.list_data[i]["feat_dynamic_real"]
+
+    for i in range(len(dataset_test)):
+        dataset_test.list_data[i]["dynamic_feat"] = dataset_test.list_data[i][
+            "feat_dynamic_real"
+        ]
+        del dataset_test.list_data[i]["feat_dynamic_real"]
 
     estimator = MQCNNEstimator.from_inputs(dataset_train, **hps)
 

From e9787819bfda90095fb52e7bed0c25f3f7b4e5ec Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Fri, 8 May 2020 12:17:08 +0200
Subject: [PATCH 36/44] Removing deepstate noise.

---
 test/model/deepstate/test_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/model/deepstate/test_model.py b/test/model/deepstate/test_model.py
index 2a6216e1b4..fd17230b14 100644
--- a/test/model/deepstate/test_model.py
+++ b/test/model/deepstate/test_model.py
@@ -20,7 +20,7 @@
 def hyperparameters(dsinfo):
     return dict(
         ctx="cpu",
-        epochs=1,
+        epochs=3,
         learning_rate=1e-2,
         hybridize=False,
         num_cells=2,
@@ -35,7 +35,7 @@ def hyperparameters(dsinfo):
 
 
 def test_accuracy(accuracy_test, hyperparameters):
-    hyperparameters.update(num_batches_per_epoch=200)
+    hyperparameters.update(num_batches_per_epoch=100)
 
     accuracy_test(DeepStateEstimator, hyperparameters, accuracy=0.5)
 

From af5eb104af680a40a10b1b419dd32528e7648bbb Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Fri, 8 May 2020 12:41:11 +0200
Subject: [PATCH 37/44] Removing deepstate noise.

---
 test/dataset/test_loader.py        | 2 +-
 test/model/deepstate/test_model.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/dataset/test_loader.py b/test/dataset/test_loader.py
index e2503e6d5c..d527da54a0 100644
--- a/test/dataset/test_loader.py
+++ b/test/dataset/test_loader.py
@@ -109,7 +109,7 @@ def test_io_speed() -> None:
 
     # name of method, loading function and min allowed throughput
     fixtures = [
-        ("baseline", baseline, 70_000),
+        ("baseline", baseline, 65_000),
         # ('json.loads', load_json, xxx),
         ("ujson.loads", load_ujson, 20_000),
         ("JsonLinesFile", load_json_lines_file, 10_000),
diff --git a/test/model/deepstate/test_model.py b/test/model/deepstate/test_model.py
index fd17230b14..0493d592e6 100644
--- a/test/model/deepstate/test_model.py
+++ b/test/model/deepstate/test_model.py
@@ -20,7 +20,7 @@
 def hyperparameters(dsinfo):
     return dict(
         ctx="cpu",
-        epochs=3,
+        epochs=1,
         learning_rate=1e-2,
         hybridize=False,
         num_cells=2,
@@ -37,7 +37,7 @@ def hyperparameters(dsinfo):
 def test_accuracy(accuracy_test, hyperparameters):
     hyperparameters.update(num_batches_per_epoch=100)
 
-    accuracy_test(DeepStateEstimator, hyperparameters, accuracy=0.5)
+    accuracy_test(DeepStateEstimator, hyperparameters, accuracy=0.75)
 
 
 def test_repr(repr_test, hyperparameters):

From e3ad55457950e2c2b4729035d972aced414a24cd Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Fri, 8 May 2020 13:14:27 +0200
Subject: [PATCH 38/44] Adjusting read speed baseline for windows.

---
 test/dataset/test_loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/dataset/test_loader.py b/test/dataset/test_loader.py
index d527da54a0..fede335e5d 100644
--- a/test/dataset/test_loader.py
+++ b/test/dataset/test_loader.py
@@ -109,7 +109,7 @@ def test_io_speed() -> None:
 
     # name of method, loading function and min allowed throughput
     fixtures = [
-        ("baseline", baseline, 65_000),
+        ("baseline", baseline, 60_000),
         # ('json.loads', load_json, xxx),
         ("ujson.loads", load_ujson, 20_000),
         ("JsonLinesFile", load_json_lines_file, 10_000),

From a7973a414a7092a97dd0df76b824e82cb185f803 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Tue, 12 May 2020 17:12:43 +0200
Subject: [PATCH 39/44] Added dynamic input to MQCNN decoder.

---
 src/gluonts/block/decoder.py                  | 94 -------------------
 src/gluonts/block/encoder.py                  |  2 +
 .../model/seq2seq/_forking_estimator.py       | 13 ++-
 src/gluonts/model/seq2seq/_forking_network.py | 30 ++++--
 .../model/seq2seq/_mq_dnn_estimator.py        |  5 +-
 src/gluonts/model/seq2seq/_transform.py       | 46 ++++-----
 test/model/seq2seq/test_model.py              |  6 +-
 7 files changed, 60 insertions(+), 136 deletions(-)

diff --git a/src/gluonts/block/decoder.py b/src/gluonts/block/decoder.py
index 6bbc816b2c..ccb5bd997a 100644
--- a/src/gluonts/block/decoder.py
+++ b/src/gluonts/block/decoder.py
@@ -137,100 +137,6 @@ def hybrid_forward(
         return mlp_output
 
 
-class ForkingMLPDecoderWithFutureFeat(Seq2SeqDecoder):
-    """
-    Multilayer perceptron decoder for sequence-to-sequence models.
-
-    See [WTN+17]_ for details.
-
-    Parameters
-    ----------
-    dec_len
-        length of the decoder (usually the number of forecasted time steps).
-
-    final_dim
-        dimensionality of the output per time step (number of predicted
-        quantiles).
-
-    hidden_dimension_sequence
-        number of hidden units for each MLP layer.
-    """
-
-    @validated()
-    def __init__(
-        self,
-        dec_len: int,
-        final_dim: int,
-        hidden_dimension_sequence: List[int] = [],
-        **kwargs,
-    ) -> None:
-        super().__init__(**kwargs)
-
-        self.dec_len = dec_len
-        self.final_dims = final_dim
-
-        with self.name_scope():
-            self.model = nn.HybridSequential()
-
-            for layer_no, layer_dim in enumerate(hidden_dimension_sequence):
-                layer = nn.Dense(
-                    dec_len * layer_dim,
-                    flatten=False,
-                    activation="relu",
-                    prefix=f"mlp_{layer_no:#02d}'_",
-                )
-                self.model.add(layer)
-
-            layer = nn.Dense(
-                dec_len * final_dim,
-                flatten=False,
-                activation="softrelu",
-                prefix=f"mlp_{len(hidden_dimension_sequence):#02d}'_",
-            )
-            self.model.add(layer)
-
-    # TODO: add support for static input at some point
-    def hybrid_forward(
-        self,
-        F,
-        dynamic_output_encoder: Tensor,
-        dynamic_input: Tensor,
-        static_output_encoder: Tensor = None,
-    ) -> Tensor:
-        """
-        ForkingMLPDecoder forward call.
-
-        Parameters
-        ----------
-        F
-            A module that can either refer to the Symbol API or the NDArray
-            API in MXNet.
-        dynamic_input_encoder
-            dynamic_features, shape (batch_size, sequence_length, num_features) or (N, T, C)
-            where sequence_length is equal to the encoder length, and num_features is equal
-            to channel_seq[-1] for the MQCNN for example.
-        dynamic_input
-            dynamic_features, shape (batch_size, encoder_length, decoder_length, num_features_02)
-            or (N, T, C).
-        static_input_encoder
-            not used in this decoder.
-
-        Returns
-        -------
-        Tensor
-            mlp output, shape (batch_size, encoder_length, dec_len, final_dims).
-
-        """
-        mlp_output = self.model(dynamic_output_encoder)
-        mlp_output = mlp_output.reshape(
-            shape=(0, 0, self.dec_len, self.final_dims)
-        )
-        # mlp_output = F.concat(
-        #     mlp_output, dynamic_input, dim=-1
-        # )  # TODO: would -1 work?
-        return mlp_output
-
-
 class OneShotDecoder(Seq2SeqDecoder):
     """
     OneShotDecoder.
diff --git a/src/gluonts/block/encoder.py b/src/gluonts/block/encoder.py
index 76e1cd7fa9..f2a4550d85 100644
--- a/src/gluonts/block/encoder.py
+++ b/src/gluonts/block/encoder.py
@@ -111,6 +111,7 @@ def _assemble_inputs(
         return inputs
 
 
+# TODO: fix handling of static features
 class HierarchicalCausalConv1DEncoder(Seq2SeqEncoder):
     """
     Defines a stack of dilated convolutions as the encoder.
@@ -216,6 +217,7 @@ def hybrid_forward(
         elif self.use_dynamic_feat:
             inputs = F.concat(target, dynamic_features, dim=2)  # (N, T, C)
         else:
+            # For now, static features only used when dynamic feat enabled
             inputs = target
 
         # NTC -> NCT (or NCW)
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 7211fa9904..cb9b5b421d 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -41,7 +41,6 @@
     RenameFields,
     AddConstFeature,
     RemoveFields,
-    AsNumpyArray,
     AddObservedValuesIndicator,
     SetField,
 )
@@ -280,9 +279,15 @@ def create_transformation(self) -> Transformation:
                 train_sampler=TestSplitSampler(),
                 enc_len=self.context_length,
                 dec_len=self.prediction_length,
-                encoder_series_fields=[FieldName.FEAT_DYNAMIC],
-                # decoder_series_fileds=[FieldName.FEAT_TIME],
-                shared_series_fields=[FieldName.OBSERVED_VALUES],
+                encoder_series_fields=[
+                    FieldName.OBSERVED_VALUES,
+                    FieldName.FEAT_DYNAMIC,
+                ],
+                decoder_series_fields=[
+                    FieldName.OBSERVED_VALUES,
+                    FieldName.FEAT_DYNAMIC,
+                ],
+                prediction_time_decoder_exclude=[FieldName.OBSERVED_VALUES],
             ),
         )
 
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 5b55af3db7..6a500a3729 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -104,6 +104,7 @@ def get_decoder_network_output(
         F,
         past_target: Tensor,
         past_feat_dynamic: Tensor,
+        future_feat_dynamic: Tensor,
         feat_static_cat: Tensor,
         past_observed_values: Tensor,
     ) -> Tensor:
@@ -144,10 +145,20 @@ def get_decoder_network_output(
             enc_output_static, enc_output_dynamic, F.zeros(shape=(1,))
         )
 
+        # flatten the last two dimensions:
+        # => (batch_size, encoder_length, decoder_length * num_feature_dynamic)
+        future_feat_dynamic = F.reshape(future_feat_dynamic, shape=(0, 0, -1))
+
+        # concatenate output of decoder and future_feat_dynamic covariates:
+        # => (batch_size, encoder_length, num_dec_input_dynamic + num_future_feat_dynamic)
+        total_dec_input_dynamic = F.concat(
+            dec_input_dynamic, future_feat_dynamic, dim=2
+        )
+
         # arguments: dynamic_input, static_input
         # TODO: optimize what we pass to the decoder for the prediction case,
         #  where we we only need to pass the encoder output for the last time step
-        dec_output = self.decoder(dec_input_dynamic, dec_input_static)
+        dec_output = self.decoder(total_dec_input_dynamic, dec_input_static)
 
         # the output shape should be: (batch_size, enc_len, dec_len, final_dims)
         return dec_output
@@ -158,9 +169,10 @@ class ForkingSeq2SeqTrainingNetwork(ForkingSeq2SeqNetworkBase):
     def hybrid_forward(
         self,
         F,
-        future_target: Tensor,
         past_target: Tensor,
+        future_target: Tensor,
         past_feat_dynamic: Tensor,
+        future_feat_dynamic: Tensor,
         feat_static_cat: Tensor,
         past_observed_values: Tensor,
         future_observed_values: Tensor,
@@ -170,17 +182,16 @@ def hybrid_forward(
         ----------
         F: mx.symbol or mx.ndarray
             Gluon function space
-        future_target: Tensor
-            shape (batch_size, encoder_length, decoder_length)
         past_target: Tensor
             shape (batch_size, encoder_length, 1)
-        feat_static_cat
-            shape (batch_size, encoder_length, num_feature_static_cat)
+        future_target: Tensor
+            shape (batch_size, encoder_length, decoder_length)
         past_feat_dynamic
             shape (batch_size, encoder_length, num_feature_dynamic)
         future_feat_dynamic
             shape (batch_size, encoder_length, decoder_length, num_feature_dynamic)
-            # or shape (batch_size, decoder_length, num_feature_dynamic) replicated for each of the encoder steps
+        feat_static_cat
+            shape (batch_size, encoder_length, num_feature_static_cat)
         past_observed_values: Tensor
             shape (batch_size, encoder_length, 1)
         future_observed_values: Tensor
@@ -194,6 +205,7 @@ def hybrid_forward(
             F,
             past_target,
             past_feat_dynamic,
+            future_feat_dynamic,
             feat_static_cat,
             past_observed_values,
         )
@@ -216,6 +228,7 @@ def hybrid_forward(
         F,
         past_target: Tensor,
         past_feat_dynamic: Tensor,
+        future_feat_dynamic: Tensor,
         feat_static_cat: Tensor,
         past_observed_values: Tensor,
     ) -> Tensor:
@@ -230,6 +243,8 @@ def hybrid_forward(
             shape (batch_size, encoder_length, num_feature_static_cat)
         past_feat_dynamic
             shape (batch_size, encoder_length, num_feature_dynamic)
+        future_feat_dynamic
+            shape (batch_size, encoder_length, decoder_length, num_feature_dynamic)
         past_observed_values: Tensor
             shape (batch_size, encoder_length, 1)
 
@@ -242,6 +257,7 @@ def hybrid_forward(
             F,
             past_target,
             past_feat_dynamic,
+            future_feat_dynamic,
             feat_static_cat,
             past_observed_values,
         )
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 88dfb695bc..dee7412fc1 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -157,6 +157,9 @@ def derive_auto_fields(cls, train_iter):
         }
 
     # FIXME: for now we always want the dataset to be cached and utilize multiprocessing.
+    # TODO it properly: Enable caching of the dataset in the `_load_datasets` function of the shell,
+    #  and pass `num_workers` from train_env in the `run_train_and_test` method to `run_train`,
+    #  which in turn has to pass it to train(...)
     def train(
         self,
         training_data: Dataset,
@@ -176,7 +179,7 @@ def train(
         num_workers = (
             num_workers
             if num_workers is not None
-            else int(multiprocessing.cpu_count() * (1 / 2))
+            else int(np.ceil(np.sqrt(multiprocessing.cpu_count())))
         )
 
         return super().train(
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index bc4ecf4a7e..a36231b2d5 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -47,7 +47,7 @@ def __init__(
         target_field=FieldName.TARGET,
         encoder_series_fields: Optional[List[str]] = None,
         decoder_series_fields: Optional[List[str]] = None,
-        shared_series_fields: Optional[List[str]] = None,
+        prediction_time_decoder_exclude: Optional[List[str]] = None,
         is_pad_out: str = "is_pad",
         start_input_field: str = "start",
     ) -> None:
@@ -61,16 +61,18 @@ def __init__(
         self.target_field = target_field
 
         self.encoder_series_fields = (
-            encoder_series_fields if encoder_series_fields is not None else []
+            encoder_series_fields + [self.target_field]
+            if encoder_series_fields is not None
+            else [self.target_field]
         )
         self.decoder_series_fields = (
-            decoder_series_fields if decoder_series_fields is not None else []
+            decoder_series_fields + [self.target_field]
+            if decoder_series_fields is not None
+            else [self.target_field]
         )
-        # defines the fields that are shared among encoder and decoder,
-        # this includes the target by default
-        self.shared_series_fields = (
-            shared_series_fields + [self.target_field]
-            if shared_series_fields is not None
+        self.prediction_time_decoder_exclude = (
+            prediction_time_decoder_exclude + [self.target_field]
+            if prediction_time_decoder_exclude is not None
             else [self.target_field]
         )
 
@@ -102,14 +104,8 @@ def flatmap_transform(
         else:
             sampling_indices = [len(target)]
 
-        decoder_fields = set(
-            self.shared_series_fields + self.decoder_series_fields
-        )
-
         ts_fields_counter = Counter(
-            self.encoder_series_fields
-            + self.shared_series_fields
-            + self.decoder_series_fields
+            set(self.encoder_series_fields + self.decoder_series_fields)
         )
 
         for sampling_idx in sampling_indices:
@@ -138,21 +134,19 @@ def flatmap_transform(
 
                 out[self._past(ts_field)] = past_piece.transpose()
 
-                # in prediction mode, don't provide decode-values
-                if not is_train and (ts_field in self.shared_series_fields):
+                # exclude some fields at prediction time
+                if (
+                    not is_train
+                    and ts_field in self.prediction_time_decoder_exclude
+                ):
                     continue
 
-                # TODO: do the same to the future dynamic feat as we do to the target
-
                 # This is were some of the forking magic happens:
                 # For each of the encoder_len time-steps at which the decoder is applied we slice the
                 # corresponding inputs called decoder_fields to the appropriate dec_len
-                if ts_field in decoder_fields:
-                    d3: Any = () if ts_field in self.shared_series_fields else (
-                        len(ts),
-                    )
+                if ts_field in self.decoder_series_fields:
                     forking_dec_field = np.zeros(
-                        shape=(self.enc_len, self.dec_len) + d3
+                        shape=(self.enc_len, self.dec_len, len(ts))
                     )
 
                     skip = max(0, self.enc_len - sampling_idx)
@@ -162,9 +156,9 @@ def flatmap_transform(
                         forking_dec_field[skip:],
                         range(start_idx + 1, start_idx + self.enc_len + 1),
                     ):
-                        dec_field[:] = ts[:, idx : idx + self.dec_len]
+                        dec_field[:] = ts[:, idx : idx + self.dec_len].T
 
-                    out[self._future(ts_field)] = forking_dec_field
+                    out[self._future(ts_field)] = np.squeeze(forking_dec_field)
 
             # So far pad indicator not in use
             pad_indicator = np.zeros(self.enc_len)
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index df54f1dbb8..85b3acea9d 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -31,7 +31,6 @@ def hyperparameters(dsinfo):
         num_batches_per_epoch=1,
         quantiles=[0.1, 0.5, 0.9],
         use_symbol_block_predictor=True,
-        num_workers=0,
     )
 
 
@@ -72,7 +71,6 @@ def test_mqcnn_covariate_smoke_test(
         "add_time_feature": add_time_feature,
         "add_age_feature": add_age_feature,
         "hybridize": hybridize,
-        "num_workers": 0,
     }
 
     dataset_train, dataset_test = make_dummy_datasets_with_features(
@@ -84,7 +82,7 @@ def test_mqcnn_covariate_smoke_test(
 
     estimator = MQCNNEstimator.from_hyperparameters(**hps)
 
-    predictor = estimator.train(dataset_train)
+    predictor = estimator.train(dataset_train, num_workers=0)
     forecasts = list(predictor.predict(dataset_test))
     assert len(forecasts) == len(dataset_test)
 
@@ -129,6 +127,6 @@ def test_backwards_compatibility():
 
     estimator = MQCNNEstimator.from_inputs(dataset_train, **hps)
 
-    predictor = estimator.train(dataset_train)
+    predictor = estimator.train(dataset_train, num_workers=0)
     forecasts = list(predictor.predict(dataset_test))
     assert len(forecasts) == len(dataset_test)

From 63b2565dc915dc525843564cc650310053853b80 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Tue, 12 May 2020 19:00:36 +0200
Subject: [PATCH 40/44] Added toggle option for dynamic future feat.

---
 src/gluonts/block/enc2dec.py                  | 83 +++++++++++++++----
 .../model/seq2seq/_forking_estimator.py       | 19 +++--
 src/gluonts/model/seq2seq/_forking_network.py | 16 +---
 .../model/seq2seq/_mq_dnn_estimator.py        |  2 +
 src/gluonts/model/seq2seq/_seq2seq_network.py |  2 +-
 src/gluonts/model/seq2seq/_transform.py       | 31 +++++--
 test/model/seq2seq/test_model.py              |  8 +-
 7 files changed, 114 insertions(+), 47 deletions(-)

diff --git a/src/gluonts/block/enc2dec.py b/src/gluonts/block/enc2dec.py
index 7f61a63f59..9bc58f8567 100644
--- a/src/gluonts/block/enc2dec.py
+++ b/src/gluonts/block/enc2dec.py
@@ -38,7 +38,7 @@ def hybrid_forward(
         F,
         encoder_output_static: Tensor,
         encoder_output_dynamic: Tensor,
-        future_features: Tensor,
+        future_features_dynamic: Tensor,
     ) -> Tuple[Tensor, Tensor, Tensor]:
         """
         Parameters
@@ -48,10 +48,10 @@ def hybrid_forward(
             shape (batch_size, num_features) or (N, C)
 
         encoder_output_dynamic
-            shape (batch_size, context_length, num_features) or (N, T, C)
+            shape (batch_size, sequence_length, num_features) or (N, T, C)
 
-        future_features
-            shape (batch_size, prediction_length, num_features) or (N, T, C)
+        future_features_dynamic
+            shape (batch_size, sequence_length, prediction_length, num_features) or (N, T, P, C`)
 
 
         Returns
@@ -59,12 +59,8 @@ def hybrid_forward(
         Tensor
             shape (batch_size, num_features) or (N, C)
 
-        Tensor
-            shape (batch_size, prediction_length, num_features) or (N, T, C)
-
         Tensor
             shape (batch_size, sequence_length, num_features) or (N, T, C)
-
         """
         pass
 
@@ -72,7 +68,7 @@ def hybrid_forward(
 class PassThroughEnc2Dec(Seq2SeqEnc2Dec):
     """
     Simplest class for passing encoder tensors do decoder. Passes through
-    tensors.
+    tensors, except that future_features_dynamic is dropped.
     """
 
     def hybrid_forward(
@@ -80,8 +76,8 @@ def hybrid_forward(
         F,
         encoder_output_static: Tensor,
         encoder_output_dynamic: Tensor,
-        future_features: Tensor,
-    ) -> Tuple[Tensor, Tensor, Tensor]:
+        future_features_dynamic: Tensor,
+    ) -> Tuple[Tensor, Tensor]:
         """
         Parameters
         ----------
@@ -90,10 +86,10 @@ def hybrid_forward(
             shape (batch_size, num_features) or (N, C)
 
         encoder_output_dynamic
-            shape (batch_size, context_length, num_features) or (N, T, C)
+            shape (batch_size, sequence_length, num_features) or (N, T, C)
 
-        future_features
-            shape (batch_size, prediction_length, num_features) or (N, T, C)
+        future_features_dynamic
+            shape (batch_size, sequence_length, prediction_length, num_features) or (N, T, P, C`)
 
 
         Returns
@@ -102,10 +98,63 @@ def hybrid_forward(
             shape (batch_size, num_features) or (N, C)
 
         Tensor
-            shape (batch_size, prediction_length, num_features) or (N, T, C)
+            shape (batch_size, prediction_length, num_features_02) or (N, T, C)
+        """
+        return encoder_output_static, encoder_output_dynamic
 
-        Tensor
+
+class FutureFeatIntegratorEnc2Dec(Seq2SeqEnc2Dec):
+    """
+    Integrates the encoder_output_dynamic and future_features_dynamic into one
+    and passes them through as the dynamic input to the decoder.
+    """
+
+    def hybrid_forward(
+        self,
+        F,
+        encoder_output_static: Tensor,
+        encoder_output_dynamic: Tensor,
+        future_features_dynamic: Tensor,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Parameters
+        ----------
+
+        encoder_output_static
+            shape (batch_size, num_features) or (N, C)
+
+        encoder_output_dynamic
             shape (batch_size, sequence_length, num_features) or (N, T, C)
 
+        future_features_dynamic
+            shape (batch_size, sequence_length, prediction_length, num_features) or (N, T, P, C`)
+
+
+        Returns
+        -------
+        Tensor
+            shape (batch_size, num_features) or (N, C)
+
+        Tensor
+            shape (batch_size, prediction_length, num_features_02) or (N, T, C)
+
+        Note: only the two tensors above are returned; the future features
+        are concatenated into the second one.
         """
-        return encoder_output_static, encoder_output_dynamic, future_features
+
+        # flatten the last two dimensions:
+        # => (batch_size, encoder_length, decoder_length * num_feature_dynamic)
+        future_features_dynamic = F.reshape(
+            future_features_dynamic, shape=(0, 0, -1)
+        )
+
+        # concatenate output of encoder and future_feat_dynamic covariates:
+        # => (batch_size, encoder_length, num_dec_input_dynamic + num_future_feat_dynamic)
+        total_dec_input_dynamic = F.concat(
+            encoder_output_dynamic, future_features_dynamic, dim=2
+        )
+
+        return (
+            encoder_output_static,
+            total_dec_input_dynamic,
+        )
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index cb9b5b421d..b06d1a55a2 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -19,7 +19,10 @@
 
 # First-party imports
 from gluonts.block.decoder import Seq2SeqDecoder
-from gluonts.block.enc2dec import PassThroughEnc2Dec
+from gluonts.block.enc2dec import (
+    PassThroughEnc2Dec,
+    FutureFeatIntegratorEnc2Dec,
+)
 from gluonts.block.encoder import Seq2SeqEncoder
 from gluonts.block.quantile_output import QuantileOutput
 from gluonts.core.component import validated, DType
@@ -127,6 +130,7 @@ def __init__(
         embedding_dimension: List[int] = None,
         add_time_feature: bool = False,
         add_age_feature: bool = False,
+        enable_decoder_dynamic_feature: bool = True,
         trainer: Trainer = Trainer(),
         dtype: DType = np.float32,
     ) -> None:
@@ -173,6 +177,7 @@ def __init__(
         self.use_dynamic_feat = (
             use_feat_dynamic_real or add_age_feature or add_time_feature
         )
+        self.enable_decoder_dynamic_feature = enable_decoder_dynamic_feature
         self.dtype = dtype
 
     def create_transformation(self) -> Transformation:
@@ -283,10 +288,12 @@ def create_transformation(self) -> Transformation:
                     FieldName.OBSERVED_VALUES,
                     FieldName.FEAT_DYNAMIC,
                 ],
-                decoder_series_fields=[
-                    FieldName.OBSERVED_VALUES,
-                    FieldName.FEAT_DYNAMIC,
-                ],
+                decoder_series_fields=[FieldName.OBSERVED_VALUES]
+                + (
+                    [FieldName.FEAT_DYNAMIC]
+                    if self.enable_decoder_dynamic_feature
+                    else []
+                ),
                 prediction_time_decoder_exclude=[FieldName.OBSERVED_VALUES],
             ),
         )
@@ -296,7 +303,7 @@ def create_transformation(self) -> Transformation:
     def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
         return ForkingSeq2SeqTrainingNetwork(
             encoder=self.encoder,
-            enc2dec=PassThroughEnc2Dec(),
+            enc2dec=FutureFeatIntegratorEnc2Dec(),
             decoder=self.decoder,
             quantile_output=self.quantile_output,
             context_length=self.context_length,
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 6a500a3729..4bfc1a5821 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -141,24 +141,14 @@ def get_decoder_network_output(
         )
 
         # arguments: encoder_output_static, encoder_output_dynamic, future_features
-        dec_input_static, dec_input_dynamic, _ = self.enc2dec(
-            enc_output_static, enc_output_dynamic, F.zeros(shape=(1,))
-        )
-
-        # flatten the last two dimensions:
-        # => (batch_size, encoder_length, decoder_length * num_feature_dynamic)
-        future_feat_dynamic = F.reshape(future_feat_dynamic, shape=(0, 0, -1))
-
-        # concatenate output of decoder and future_feat_dynamic covariates:
-        # => (batch_size, encoder_length, num_dec_input_dynamic + num_future_feat_dynamic)
-        total_dec_input_dynamic = F.concat(
-            dec_input_dynamic, future_feat_dynamic, dim=2
+        dec_input_static, dec_input_dynamic = self.enc2dec(
+            enc_output_static, enc_output_dynamic, future_feat_dynamic
         )
 
         # arguments: dynamic_input, static_input
         # TODO: optimize what we pass to the decoder for the prediction case,
         #  where we we only need to pass the encoder output for the last time step
-        dec_output = self.decoder(total_dec_input_dynamic, dec_input_static)
+        dec_output = self.decoder(dec_input_dynamic, dec_input_static)
 
         # the output shape should be: (batch_size, enc_len, dec_len, final_dims)
         return dec_output
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index dee7412fc1..f8576fc1af 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -48,6 +48,7 @@ def __init__(
         embedding_dimension: List[int] = None,
         add_time_feature: bool = False,
         add_age_feature: bool = False,
+        enable_decoder_dynamic_feature: bool = True,
         seed: Optional[int] = None,
         decoder_mlp_dim_seq: Optional[List[int]] = None,
         channels_seq: Optional[List[int]] = None,
@@ -139,6 +140,7 @@ def __init__(
             context_length=context_length,
             use_feat_dynamic_real=use_feat_dynamic_real,
             use_feat_static_cat=use_feat_static_cat,
+            enable_decoder_dynamic_feature=enable_decoder_dynamic_feature,
             cardinality=cardinality,
             embedding_dimension=embedding_dimension,
             add_time_feature=add_time_feature,
diff --git a/src/gluonts/model/seq2seq/_seq2seq_network.py b/src/gluonts/model/seq2seq/_seq2seq_network.py
index 4db4e8dd8b..566b41297b 100644
--- a/src/gluonts/model/seq2seq/_seq2seq_network.py
+++ b/src/gluonts/model/seq2seq/_seq2seq_network.py
@@ -89,7 +89,7 @@ def compute_decoder_outputs(
         encoder_output_static, encoder_output_dynamic = self.encoder(
             scaled_target, embedded_cat, past_feat_dynamic_real
         )
-        decoder_input_static, _, decoder_input_dynamic = self.enc2dec(
+        decoder_input_static, decoder_input_dynamic = self.enc2dec(
             encoder_output_static,
             encoder_output_dynamic,
             future_feat_dynamic_real,
diff --git a/src/gluonts/model/seq2seq/_transform.py b/src/gluonts/model/seq2seq/_transform.py
index a36231b2d5..9ab15422cb 100644
--- a/src/gluonts/model/seq2seq/_transform.py
+++ b/src/gluonts/model/seq2seq/_transform.py
@@ -70,12 +70,19 @@ def __init__(
             if decoder_series_fields is not None
             else [self.target_field]
         )
+
+        # Fields that are not used at prediction time for the decoder
         self.prediction_time_decoder_exclude = (
             prediction_time_decoder_exclude + [self.target_field]
             if prediction_time_decoder_exclude is not None
             else [self.target_field]
         )
 
+        # Fields that are disabled for the decoder (dummy fields still created)
+        self.decoder_disabled_fields = list(
+            set(self.encoder_series_fields) - set(self.decoder_series_fields)
+        )
+
         self.is_pad_out = is_pad_out
         self.start_in = start_input_field
 
@@ -144,19 +151,25 @@ def flatmap_transform(
                 # This is were some of the forking magic happens:
                 # For each of the encoder_len time-steps at which the decoder is applied we slice the
                 # corresponding inputs called decoder_fields to the appropriate dec_len
-                if ts_field in self.decoder_series_fields:
+                if (
+                    ts_field
+                    in self.decoder_series_fields
+                    + self.decoder_disabled_fields
+                ):
                     forking_dec_field = np.zeros(
                         shape=(self.enc_len, self.dec_len, len(ts))
                     )
 
-                    skip = max(0, self.enc_len - sampling_idx)
-                    # This section takes by far the longest time computationally:
-                    # This scales linearly in self.enc_len and linearly in self.dec_len
-                    for dec_field, idx in zip(
-                        forking_dec_field[skip:],
-                        range(start_idx + 1, start_idx + self.enc_len + 1),
-                    ):
-                        dec_field[:] = ts[:, idx : idx + self.dec_len].T
+                    # in case it's not disabled we copy the actual values
+                    if ts_field not in self.decoder_disabled_fields:
+                        skip = max(0, self.enc_len - sampling_idx)
+                        # This section takes by far the longest time computationally:
+                        # This scales linearly in self.enc_len and linearly in self.dec_len
+                        for dec_field, idx in zip(
+                            forking_dec_field[skip:],
+                            range(start_idx + 1, start_idx + self.enc_len + 1),
+                        ):
+                            dec_field[:] = ts[:, idx : idx + self.dec_len].T
 
                     out[self._future(ts_field)] = np.squeeze(forking_dec_field)
 
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index 85b3acea9d..a6f8f054e8 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -56,9 +56,14 @@ def test_accuracy(
 @pytest.mark.parametrize("use_feat_dynamic_real", [True, False])
 @pytest.mark.parametrize("add_time_feature", [True, False])
 @pytest.mark.parametrize("add_age_feature", [True, False])
+@pytest.mark.parametrize("enable_decoder_dynamic_feature", [True, False])
 @pytest.mark.parametrize("hybridize", [True, False])
 def test_mqcnn_covariate_smoke_test(
-    use_feat_dynamic_real, add_time_feature, add_age_feature, hybridize
+    use_feat_dynamic_real,
+    add_time_feature,
+    add_age_feature,
+    enable_decoder_dynamic_feature,
+    hybridize,
 ):
     hps = {
         "seed": 42,
@@ -70,6 +75,7 @@ def test_mqcnn_covariate_smoke_test(
         "use_feat_dynamic_real": use_feat_dynamic_real,
         "add_time_feature": add_time_feature,
         "add_age_feature": add_age_feature,
+        "enable_decoder_dynamic_feature": enable_decoder_dynamic_feature,
         "hybridize": hybridize,
     }
 

From 18e45bc2d0058a6a7704612df36b89eb1514395c Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Tue, 12 May 2020 19:08:25 +0200
Subject: [PATCH 41/44] Changed default of future dynamic to disabled.

---
 src/gluonts/model/seq2seq/_forking_estimator.py | 2 +-
 src/gluonts/model/seq2seq/_mq_dnn_estimator.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index b06d1a55a2..1613186e8a 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -130,7 +130,7 @@ def __init__(
         embedding_dimension: List[int] = None,
         add_time_feature: bool = False,
         add_age_feature: bool = False,
-        enable_decoder_dynamic_feature: bool = True,
+        enable_decoder_dynamic_feature: bool = False,
         trainer: Trainer = Trainer(),
         dtype: DType = np.float32,
     ) -> None:
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index f8576fc1af..7080f871ea 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -48,7 +48,7 @@ def __init__(
         embedding_dimension: List[int] = None,
         add_time_feature: bool = False,
         add_age_feature: bool = False,
-        enable_decoder_dynamic_feature: bool = True,
+        enable_decoder_dynamic_feature: bool = False,
         seed: Optional[int] = None,
         decoder_mlp_dim_seq: Optional[List[int]] = None,
         channels_seq: Optional[List[int]] = None,

From c636cd84d904696a0c1b43b2e361ad3fd0fd6792 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Wed, 13 May 2020 14:17:28 +0200
Subject: [PATCH 42/44] Turning user specified arguments into implications.

---
 .../model/seq2seq/_mq_dnn_estimator.py        | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 7080f871ea..cc011f8786 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -18,6 +18,7 @@
 # Third-party imports
 import numpy as np
 import mxnet as mx
+import logging
 
 # First-party imports
 from gluonts.dataset.common import Dataset, ListDataset
@@ -106,6 +107,8 @@ def __init__(
             f"{len(self.dilation_seq)} vs. {len(self.kernel_size_seq)}"
         )
 
+        print("Use dynamic real", use_feat_dynamic_real)
+
         if seed:
             np.random.seed(seed)
             mx.random.seed(seed)
@@ -181,9 +184,12 @@ def train(
         num_workers = (
             num_workers
             if num_workers is not None
-            else int(np.ceil(np.sqrt(multiprocessing.cpu_count())))
+            else min(4, int(np.ceil(np.sqrt(multiprocessing.cpu_count()))))
         )
 
+        logger = logging.getLogger(__name__)
+        logger.info(f"gluonts[multiprocessing]: num_workers={num_workers}")
+
         return super().train(
             training_data=cached_train_data,
             validation_data=cached_validation_data,
@@ -192,6 +198,39 @@ def train(
             **kwargs,
         )
 
+    @classmethod
+    def from_inputs(cls, train_iter, **params):
+        # auto_params usually include `use_feat_dynamic_real`, `use_feat_static_cat` and `cardinality`
+        auto_params = cls.derive_auto_fields(train_iter)
+
+        # user defined arguments become implications
+        if (
+            "use_feat_dynamic_real" in params.keys()
+            and params["use_feat_dynamic_real"]
+            and not auto_params["use_feat_dynamic_real"]
+        ):
+            logger = logging.getLogger(__name__)
+            logger.warning(
+                f"gluonts[from_inputs]: use_feat_dynamic_real set to False since it is not present in the data."
+            )
+            params["use_feat_dynamic_real"] = False
+
+        if (
+            "use_feat_static_cat" in params.keys()
+            and params["use_feat_static_cat"]
+            and not auto_params["use_feat_static_cat"]
+        ):
+            logger = logging.getLogger(__name__)
+            logger.warning(
+                f"gluonts[from_inputs]: use_feat_static_cat set to False since it is not present in the data."
+            )
+            params["use_feat_static_cat"] = False
+            params["cardinality"] = None
+
+        # user specified 'params' will take precedence:
+        params = {**auto_params, **params}
+        return cls.from_hyperparameters(**params)
+
 
 class MQRNNEstimator(ForkingSeq2SeqEstimator):
     """

From 442edf560df48df29178b219002e3a2738db6dde Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Mon, 18 May 2020 21:09:49 +0200
Subject: [PATCH 43/44] Adding documentation for MQCNN parameters, removing non
 gluonts code.

---
 src/gluonts/block/encoder.py                  |   2 +-
 .../model/seq2seq/_forking_estimator.py       |  39 +++--
 src/gluonts/model/seq2seq/_forking_network.py |  12 +-
 .../model/seq2seq/_mq_dnn_estimator.py        | 148 +++++++++---------
 test/model/seq2seq/test_model.py              |  27 ++++
 5 files changed, 134 insertions(+), 94 deletions(-)

diff --git a/src/gluonts/block/encoder.py b/src/gluonts/block/encoder.py
index f2a4550d85..ceff6b740f 100644
--- a/src/gluonts/block/encoder.py
+++ b/src/gluonts/block/encoder.py
@@ -238,7 +238,7 @@ def hybrid_forward(
 
 class RNNEncoder(Seq2SeqEncoder):
     """
-     Defines RNN encoder that uses covariates and target as input to the RNN if desired.
+    Defines RNN encoder that uses covariates and target as input to the RNN if desired.
 
     Parameters
     ----------
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 1613186e8a..936782f37e 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -19,10 +19,7 @@
 
 # First-party imports
 from gluonts.block.decoder import Seq2SeqDecoder
-from gluonts.block.enc2dec import (
-    PassThroughEnc2Dec,
-    FutureFeatIntegratorEnc2Dec,
-)
+from gluonts.block.enc2dec import FutureFeatIntegratorEnc2Dec
 from gluonts.block.encoder import Seq2SeqEncoder
 from gluonts.block.quantile_output import QuantileOutput
 from gluonts.core.component import validated, DType
@@ -89,11 +86,11 @@ class ForkingSeq2SeqEstimator(GluonEstimator):
     quantile_output
         quantile output
     freq
-        frequency of the time series
+        frequency of the time series.
     prediction_length
-        length of the decoding sequence
+        length of the decoding sequence.
     context_length
-        length of the encoding sequence (prediction_length is used if None)
+        length of the encoding sequence (default: 4 * prediction_length)
     use_feat_dynamic_real
         Whether to use the ``feat_dynamic_real`` field from the data (default: False)
     use_feat_static_cat:
@@ -105,12 +102,18 @@ class ForkingSeq2SeqEstimator(GluonEstimator):
         Dimension of the embeddings for categorical features
         (default: [min(50, (cat+1)//2) for cat in cardinality])
     add_time_feature
-        Adds a set of time features.
+        Adds a set of time features. (default: True)
     add_age_feature
-        Adds an age feature.
+        Adds an age feature. (default: True)
         The age feature starts with a small value at the start of the time series and grows over time.
+    enable_decoder_dynamic_feature
+        Whether the decoder should also be provided with the dynamic features (``age``, ``time``
+        and ``feat_dynamic_real`` if enabled respectively). (default: True)
+        It makes sense to disable this, if you don't have ``feat_dynamic_real`` for the prediction range.
     trainer
         trainer (default: Trainer())
+    scaling
+        Whether to automatically scale the target values (default: False)
     dtype
         (default: np.float32)
     """
@@ -128,10 +131,11 @@ def __init__(
         use_feat_static_cat: bool = False,
         cardinality: List[int] = None,
         embedding_dimension: List[int] = None,
-        add_time_feature: bool = False,
-        add_age_feature: bool = False,
-        enable_decoder_dynamic_feature: bool = False,
+        add_time_feature: bool = True,
+        add_age_feature: bool = True,
+        enable_decoder_dynamic_feature: bool = True,
         trainer: Trainer = Trainer(),
+        scaling: bool = False,
         dtype: DType = np.float32,
     ) -> None:
         super().__init__(trainer=trainer)
@@ -160,7 +164,7 @@ def __init__(
         self.context_length = (
             context_length
             if context_length is not None
-            else self.prediction_length
+            else 4 * self.prediction_length
         )
         self.use_feat_dynamic_real = use_feat_dynamic_real
         self.use_feat_static_cat = use_feat_static_cat
@@ -178,6 +182,7 @@ def __init__(
             use_feat_dynamic_real or add_age_feature or add_time_feature
         )
         self.enable_decoder_dynamic_feature = enable_decoder_dynamic_feature
+        self.scaling = scaling
         self.dtype = dtype
 
     def create_transformation(self) -> Transformation:
@@ -235,7 +240,11 @@ def create_transformation(self) -> Transformation:
         if self.use_feat_dynamic_real:
             # Backwards compatibility:
             chain.append(
-                RenameFields({"dynamic_feat": FieldName.FEAT_DYNAMIC_REAL})
+                RenameFields(
+                    {
+                        FieldName.FEAT_DYNAMIC_REAL_LEGACY: FieldName.FEAT_DYNAMIC_REAL
+                    }
+                )
             )
             dynamic_feat_fields.append(FieldName.FEAT_DYNAMIC_REAL)
 
@@ -309,6 +318,7 @@ def create_training_network(self) -> ForkingSeq2SeqNetworkBase:
             context_length=self.context_length,
             cardinality=self.cardinality,
             embedding_dimension=self.embedding_dimension,
+            scaling=self.scaling,
             dtype=self.dtype,
         )
 
@@ -331,6 +341,7 @@ def create_predictor(
             context_length=self.context_length,
             cardinality=self.cardinality,
             embedding_dimension=self.embedding_dimension,
+            scaling=self.scaling,
             dtype=self.dtype,
         )
 
diff --git a/src/gluonts/model/seq2seq/_forking_network.py b/src/gluonts/model/seq2seq/_forking_network.py
index 4bfc1a5821..735431b648 100644
--- a/src/gluonts/model/seq2seq/_forking_network.py
+++ b/src/gluonts/model/seq2seq/_forking_network.py
@@ -52,6 +52,8 @@ class ForkingSeq2SeqNetworkBase(gluon.HybridBlock):
         number of values of each categorical feature.
     embedding_dimension: List[int],
         dimension of the embeddings for categorical features
+    scaling
+        Whether to automatically scale the target values (default: True)
     dtype
         (default: np.float32)
     kwargs: dict
@@ -68,6 +70,7 @@ def __init__(
         context_length: int,
         cardinality: List[int],
         embedding_dimension: List[int],
+        scaling: bool = True,
         dtype: DType = np.float32,
         **kwargs,
     ) -> None:
@@ -80,11 +83,10 @@ def __init__(
         self.context_length = context_length
         self.cardinality = cardinality
         self.embedding_dimension = embedding_dimension
+        self.scaling = scaling
         self.dtype = dtype
 
-        # TODO: implement scaling
-        scaling = False
-        if scaling:
+        if self.scaling:
             self.scaler = MeanScaler(keepdims=True)
         else:
             self.scaler = NOPScaler(keepdims=True)
@@ -111,7 +113,7 @@ def get_decoder_network_output(
 
         # scale is computed on the context length last units of the past target
         # scale shape is (batch_size, 1, *target_shape)
-        _, scale = self.scaler(
+        scaled_past_target, scale = self.scaler(
             past_target.slice_axis(
                 axis=1, begin=-self.context_length, end=None
             ),
@@ -137,7 +139,7 @@ def get_decoder_network_output(
 
         # arguments: target, static_features, dynamic_features
         enc_output_static, enc_output_dynamic = self.encoder(
-            past_target, feat_static_real, past_feat_dynamic_extended
+            scaled_past_target, feat_static_real, past_feat_dynamic_extended
         )
 
         # arguments: encoder_output_static, encoder_output_dynamic, future_features
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index cc011f8786..5f1a922485 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -35,6 +35,65 @@ class MQCNNEstimator(ForkingSeq2SeqEstimator):
     """
     An :class:`MQDNNEstimator` with a Convolutional Neural Network (CNN) as an
     encoder and a multi-quantile MLP as a decoder. Implements the MQ-CNN Forecaster, proposed in [WTN+17]_.
+
+    Parameters
+    ----------
+    freq
+        Time granularity of the data.
+    prediction_length
+        Length of the prediction, also known as 'horizon'.
+    context_length
+        Number of time units that condition the predictions, also known as 'lookback period'.
+        (default: 4 * prediction_length)
+    use_feat_dynamic_real
+        Whether to use the ``feat_dynamic_real`` field from the data. (default: False)
+        Automatically inferred when creating the MQCNNEstimator with the `from_inputs` class method.
+    use_feat_static_cat:
+        Whether to use the ``feat_static_cat`` field from the data. (default: False)
+        Automatically inferred when creating the MQCNNEstimator with the `from_inputs` class method.
+    cardinality:
+        Number of values of each categorical feature.
+        This must be set if ``use_feat_static_cat == True`` (default: None)
+        Automatically inferred when creating the MQCNNEstimator with the `from_inputs` class method.
+    embedding_dimension:
+        Dimension of the embeddings for categorical features. (default: [min(50, (cat+1)//2) for cat in cardinality])
+    add_time_feature
+        Adds a set of time features. (default: False)
+    add_age_feature
+        Adds an age feature. (default: False)
+        The age feature starts with a small value at the start of the time series and grows over time.
+    enable_decoder_dynamic_feature
+        Whether the decoder should also be provided with the dynamic features (``age``, ``time``
+        and ``feat_dynamic_real`` if enabled respectively). (default: True)
+        It makes sense to disable this, if you don't have ``feat_dynamic_real`` for the prediction range.
+    seed
+        Will set the specified int seed for numpy and MXNet if specified. (default: None)
+    decoder_mlp_dim_seq
+        The dimensionalities of the Multi Layer Perceptron layers of the decoder.
+        (default: [30])
+    channels_seq
+        The number of channels (i.e. filters or convolutions) for each layer of the HierarchicalCausalConv1DEncoder.
+        More channels usually correspond to better performance and larger network size.
+        (default: [30, 30, 30])
+    dilation_seq
+        The dilation of the convolutions in each layer of the HierarchicalCausalConv1DEncoder.
+        Greater numbers correspond to a greater receptive field of the network, which is usually
+        better with longer context_length. (Same length as channels_seq) (default: [1, 3, 5])
+    kernel_size_seq
+        The kernel sizes (i.e. window size) of the convolutions in each layer of the HierarchicalCausalConv1DEncoder.
+        (Same length as channels_seq) (default: [7, 3, 3])
+    use_residual
+        Whether the hierarchical encoder should additionally pass the unaltered
+        past target to the decoder. (default: True)
+    quantiles
+        The list of quantiles that will be optimized for, and predicted by, the model.
+        Optimizing for more quantiles than are of direct interest to you can result
+        in improved performance due to a regularizing effect.
+        (default: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
+    trainer
+        The GluonTS trainer to use for training. (default: Trainer())
+    scaling
+        Whether to automatically scale the target values. (default: False)
     """
 
     @validated()
@@ -58,6 +117,7 @@ def __init__(
         use_residual: bool = True,
         quantiles: Optional[List[float]] = None,
         trainer: Trainer = Trainer(),
+        scaling: bool = False,
     ) -> None:
 
         assert (
@@ -81,16 +141,16 @@ def __init__(
         ), "Elements of `quantiles` should be >= 0 and <= 1"
 
         self.decoder_mlp_dim_seq = (
-            decoder_mlp_dim_seq if decoder_mlp_dim_seq is not None else [20]
+            decoder_mlp_dim_seq if decoder_mlp_dim_seq is not None else [30]
         )
         self.channels_seq = (
             channels_seq if channels_seq is not None else [30, 30, 30]
         )
         self.dilation_seq = (
-            dilation_seq if dilation_seq is not None else [1, 3, 9]
+            dilation_seq if dilation_seq is not None else [1, 3, 5]
         )
         self.kernel_size_seq = (
-            kernel_size_seq if kernel_size_seq is not None else [3, 3, 3]
+            kernel_size_seq if kernel_size_seq is not None else [7, 3, 3]
         )
         self.quantiles = (
             quantiles
@@ -107,8 +167,6 @@ def __init__(
             f"{len(self.dilation_seq)} vs. {len(self.kernel_size_seq)}"
         )
 
-        print("Use dynamic real", use_feat_dynamic_real)
-
         if seed:
             np.random.seed(seed)
             mx.random.seed(seed)
@@ -149,87 +207,25 @@ def __init__(
             add_time_feature=add_time_feature,
             add_age_feature=add_age_feature,
             trainer=trainer,
+            scaling=scaling,
         )
 
     @classmethod
     def derive_auto_fields(cls, train_iter):
         stats = calculate_dataset_statistics(train_iter)
 
-        return {
+        auto_fields = {
             "use_feat_dynamic_real": stats.num_feat_dynamic_real > 0,
             "use_feat_static_cat": bool(stats.feat_static_cat),
             "cardinality": [len(cats) for cats in stats.feat_static_cat],
         }
 
-    # FIXME: for now we always want the dataset to be cached and utilize multiprocessing.
-    # TODO it properly: Enable caching of the dataset in the `_load_datasets` function of the shell,
-    #  and pass `num_workers` from train_env in the `run_train_and_test` method to `run_train`,
-    #  which in turn has to pass it to train(...)
-    def train(
-        self,
-        training_data: Dataset,
-        validation_data: Optional[Dataset] = None,
-        num_workers: Optional[int] = None,
-        num_prefetch: Optional[int] = None,
-        **kwargs,
-    ):
-        cached_train_data = ListDataset(
-            data_iter=list(training_data), freq=self.freq
-        )
-        cached_validation_data = (
-            None
-            if validation_data is None
-            else ListDataset(data_iter=list(validation_data), freq=self.freq)
-        )
-        num_workers = (
-            num_workers
-            if num_workers is not None
-            else min(4, int(np.ceil(np.sqrt(multiprocessing.cpu_count()))))
-        )
-
         logger = logging.getLogger(__name__)
-        logger.info(f"gluonts[multiprocessing]: num_workers={num_workers}")
-
-        return super().train(
-            training_data=cached_train_data,
-            validation_data=cached_validation_data,
-            num_workers=num_workers,
-            num_prefetch=num_prefetch,
-            **kwargs,
+        logger.info(
+            f"gluonts[from_inputs]: use_feat_dynamic_real set to '{auto_fields['use_feat_dynamic_real']}', and use use_feat_static_cat to '{auto_fields['use_feat_static_cat']}' with cardinality of '{auto_fields['cardinality']}'"
         )
 
-    @classmethod
-    def from_inputs(cls, train_iter, **params):
-        # auto_params usually include `use_feat_dynamic_real`, `use_feat_static_cat` and `cardinality`
-        auto_params = cls.derive_auto_fields(train_iter)
-
-        # user defined arguments become implications
-        if (
-            "use_feat_dynamic_real" in params.keys()
-            and params["use_feat_dynamic_real"]
-            and not auto_params["use_feat_dynamic_real"]
-        ):
-            logger = logging.getLogger(__name__)
-            logger.warning(
-                f"gluonts[from_inputs]: use_feat_dynamic_real set to False since it is not present in the data."
-            )
-            params["use_feat_dynamic_real"] = False
-
-        if (
-            "use_feat_static_cat" in params.keys()
-            and params["use_feat_static_cat"]
-            and not auto_params["use_feat_static_cat"]
-        ):
-            logger = logging.getLogger(__name__)
-            logger.warning(
-                f"gluonts[from_inputs]: use_feat_static_cat set to False since it is not present in the data."
-            )
-            params["use_feat_static_cat"] = False
-            params["cardinality"] = None
-
-        # user specified 'params' will take precedence:
-        params = {**auto_params, **params}
-        return cls.from_hyperparameters(**params)
+        return auto_fields
 
 
 class MQRNNEstimator(ForkingSeq2SeqEstimator):
@@ -247,6 +243,7 @@ def __init__(
         decoder_mlp_dim_seq: List[int] = None,
         trainer: Trainer = Trainer(),
         quantiles: List[float] = None,
+        scaling: bool = True,
     ) -> None:
 
         assert (
@@ -260,10 +257,12 @@ def __init__(
         ), "Elements of `quantiles` should be >= 0 and <= 1"
 
         self.decoder_mlp_dim_seq = (
-            decoder_mlp_dim_seq if decoder_mlp_dim_seq is not None else [20]
+            decoder_mlp_dim_seq if decoder_mlp_dim_seq is not None else [30]
         )
         self.quantiles = (
-            quantiles if quantiles is not None else [0.1, 0.5, 0.9]
+            quantiles
+            if quantiles is not None
+            else [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
         )
 
         # `use_static_feat` and `use_dynamic_feat` always True because network
@@ -295,4 +294,5 @@ def __init__(
             prediction_length=prediction_length,
             context_length=context_length,
             trainer=trainer,
+            scaling=scaling,
         )
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index a6f8f054e8..f9e6beee14 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -93,6 +93,33 @@ def test_mqcnn_covariate_smoke_test(
     assert len(forecasts) == len(dataset_test)
 
 
+# Smoke test MQCNNEstimator.from_inputs with scaling enabled and disabled
+@pytest.mark.parametrize("scaling", [True, False])
+def test_mqcnn_scaling_smoke_test(scaling):
+    hps = {
+        "seed": 42,
+        "freq": "D",
+        "prediction_length": 3,
+        "quantiles": [0.5, 0.1],
+        "epochs": 3,
+        "num_batches_per_epoch": 3,
+        "scaling": scaling,
+    }
+
+    dataset_train, dataset_test = make_dummy_datasets_with_features(
+        cardinality=[3, 10],
+        num_feat_dynamic_real=2,
+        freq=hps["freq"],
+        prediction_length=hps["prediction_length"],
+    )
+
+    estimator = MQCNNEstimator.from_inputs(dataset_train, **hps)
+
+    predictor = estimator.train(dataset_train, num_workers=0)
+    forecasts = list(predictor.predict(dataset_test))
+    assert len(forecasts) == len(dataset_test)
+
+
 def test_repr(Estimator, repr_test, hyperparameters):
     repr_test(Estimator, hyperparameters)
 

From d5699faed3fcba9f6485fa3010e06b27e9d60e35 Mon Sep 17 00:00:00 2001
From: Aaron Spieler <aspiele@amazon.com>
Date: Mon, 18 May 2020 21:24:49 +0200
Subject: [PATCH 44/44] Remove backwards compatibility for the legacy `dynamic_feat` field.

---
 src/gluonts/dataset/field_names.py            |  1 -
 src/gluonts/dataset/stat.py                   |  2 -
 .../model/seq2seq/_forking_estimator.py       |  8 ----
 .../model/seq2seq/_mq_dnn_estimator.py        |  4 +-
 test/model/seq2seq/test_model.py              | 37 -------------------
 5 files changed, 3 insertions(+), 49 deletions(-)

diff --git a/src/gluonts/dataset/field_names.py b/src/gluonts/dataset/field_names.py
index d686c4f26f..0e0a6ff7f8 100644
--- a/src/gluonts/dataset/field_names.py
+++ b/src/gluonts/dataset/field_names.py
@@ -27,7 +27,6 @@ class FieldName:
     FEAT_STATIC_REAL = "feat_static_real"
     FEAT_DYNAMIC_CAT = "feat_dynamic_cat"
     FEAT_DYNAMIC_REAL = "feat_dynamic_real"
-    FEAT_DYNAMIC_REAL_LEGACY = "dynamic_feat"
 
     FEAT_DYNAMIC = "feat_dynamic"
 
diff --git a/src/gluonts/dataset/stat.py b/src/gluonts/dataset/stat.py
index bc94d2604d..cfe8914c98 100644
--- a/src/gluonts/dataset/stat.py
+++ b/src/gluonts/dataset/stat.py
@@ -300,8 +300,6 @@ def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
             feat_dynamic_real = None
             if FieldName.FEAT_DYNAMIC_REAL in ts:
                 feat_dynamic_real = ts[FieldName.FEAT_DYNAMIC_REAL]
-            elif FieldName.FEAT_DYNAMIC_REAL_LEGACY in ts:
-                feat_dynamic_real = ts[FieldName.FEAT_DYNAMIC_REAL_LEGACY]
 
             if feat_dynamic_real is None:
                 # feat_dynamic_real not found, check it was the first ts we encounter or
diff --git a/src/gluonts/model/seq2seq/_forking_estimator.py b/src/gluonts/model/seq2seq/_forking_estimator.py
index 936782f37e..b0dda76e99 100644
--- a/src/gluonts/model/seq2seq/_forking_estimator.py
+++ b/src/gluonts/model/seq2seq/_forking_estimator.py
@@ -238,14 +238,6 @@ def create_transformation(self) -> Transformation:
             dynamic_feat_fields.append(FieldName.FEAT_AGE)
 
         if self.use_feat_dynamic_real:
-            # Backwards compatibility:
-            chain.append(
-                RenameFields(
-                    {
-                        FieldName.FEAT_DYNAMIC_REAL_LEGACY: FieldName.FEAT_DYNAMIC_REAL
-                    }
-                )
-            )
             dynamic_feat_fields.append(FieldName.FEAT_DYNAMIC_REAL)
 
         # we need to make sure that there is always some dynamic input
diff --git a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
index 5f1a922485..bd31ad9bc6 100644
--- a/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
+++ b/src/gluonts/model/seq2seq/_mq_dnn_estimator.py
@@ -222,7 +222,9 @@ def derive_auto_fields(cls, train_iter):
 
         logger = logging.getLogger(__name__)
         logger.info(
-            f"gluonts[from_inputs]: use_feat_dynamic_real set to '{auto_fields['use_feat_dynamic_real']}', and use use_feat_static_cat to '{auto_fields['use_feat_static_cat']}' with cardinality of '{auto_fields['cardinality']}'"
+            f"gluonts[from_inputs]: use_feat_dynamic_real set to "
+            f"'{auto_fields['use_feat_dynamic_real']}', and use_feat_static_cat to "
+            f"'{auto_fields['use_feat_static_cat']}' with cardinality of '{auto_fields['cardinality']}'"
         )
 
         return auto_fields
diff --git a/test/model/seq2seq/test_model.py b/test/model/seq2seq/test_model.py
index f9e6beee14..752abf1436 100644
--- a/test/model/seq2seq/test_model.py
+++ b/test/model/seq2seq/test_model.py
@@ -126,40 +126,3 @@ def test_repr(Estimator, repr_test, hyperparameters):
 
 def test_serialize(Estimator, serialize_test, hyperparameters):
     serialize_test(Estimator, hyperparameters)
-
-
-def test_backwards_compatibility():
-    hps = {
-        "freq": "D",
-        "prediction_length": 3,
-        "quantiles": [0.5, 0.1],
-        "epochs": 3,
-        "num_batches_per_epoch": 3,
-        "use_feat_dynamic_real": True,
-        "num_workers": 0,
-    }
-
-    dataset_train, dataset_test = make_dummy_datasets_with_features(
-        cardinality=[3, 10],
-        num_feat_dynamic_real=2,
-        freq=hps["freq"],
-        prediction_length=hps["prediction_length"],
-    )
-
-    for i in range(len(dataset_train)):
-        dataset_train.list_data[i]["dynamic_feat"] = dataset_train.list_data[
-            i
-        ]["feat_dynamic_real"]
-        del dataset_train.list_data[i]["feat_dynamic_real"]
-
-    for i in range(len(dataset_test)):
-        dataset_test.list_data[i]["dynamic_feat"] = dataset_test.list_data[i][
-            "feat_dynamic_real"
-        ]
-        del dataset_test.list_data[i]["feat_dynamic_real"]
-
-    estimator = MQCNNEstimator.from_inputs(dataset_train, **hps)
-
-    predictor = estimator.train(dataset_train, num_workers=0)
-    forecasts = list(predictor.predict(dataset_test))
-    assert len(forecasts) == len(dataset_test)