diff --git a/.gitignore b/.gitignore index 5f5b099e57..2440df06b6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ *.pyc docs/build docs/source/examples +docs/source/userguide/ +docs/source/quickstart/ docs/source/README.rst docs/source/generated_api darts.egg-info/ diff --git a/darts/logging.py b/darts/logging.py index d62ffd1aea..62494808b8 100644 --- a/darts/logging.py +++ b/darts/logging.py @@ -1,3 +1,4 @@ +import warnings import logging import os import time @@ -31,6 +32,29 @@ def get_logger(name): return logger +def raise_deprecation_warning( + message: str = "", + logger: logging.Logger = get_logger("main_logger"), +): + """ + Raises a DeprecationWarning. + + Parameters + ---------- + message + The message of the ValueError. + logger + The logger instance to log the error message if 'condition' is False. + + Raises + ------ + Warning + DeprecationWarning + """ + + logger.warning("DeprecationWarning: " + message) + + def raise_if_not( condition: bool, message: str = "", @@ -190,3 +214,17 @@ def execute_and_suppress_output(function, logger, suppression_threshold_level, * else: return_value = function(*args) return return_value + + +def suppress_lightning_warnings(suppress_all: bool = False): + warnings.filterwarnings( + "ignore", ".*You defined a `validation_step` but have no `val_dataloader`.*" + ) + if suppress_all: + warnings.filterwarnings( + "ignore", ".*does not have many workers which may be a bottleneck.*" + ) + warnings.filterwarnings( + "ignore", + ".*Trying to infer the `batch_size` from an ambiguous collection.*", + ) diff --git a/darts/models/forecasting/block_rnn_model.py b/darts/models/forecasting/block_rnn_model.py index e5cd49be0b..d8d1a5f109 100644 --- a/darts/models/forecasting/block_rnn_model.py +++ b/darts/models/forecasting/block_rnn_model.py @@ -7,32 +7,27 @@ import torch import torch.nn as nn -from numpy.random import RandomState from darts.logging import get_logger, raise_if_not -from darts.models.forecasting.torch_forecasting_model import ( - PastCovariatesTorchModel, - TorchParametricProbabilisticForecastingModel, -) -from darts.utils.likelihood_models import Likelihood -from darts.utils.torch import random_method +from darts.models.forecasting.pl_forecasting_module import PLPastCovariatesModule +from darts.models.forecasting.torch_forecasting_model import PastCovariatesTorchModel logger = get_logger(__name__) # TODO add batch norm -class _BlockRNNModule(nn.Module): +class _BlockRNNModule(PLPastCovariatesModule): def __init__( self, name: str, input_size: int, hidden_dim: int, num_layers: int, - output_chunk_length: int, target_size: int, nr_params: int, num_layers_out_fc: Optional[List] = None, dropout: float = 0.0, + **kwargs, ): """PyTorch module implementing a block RNN to be used in `BlockRNNModel`. @@ -58,8 +53,6 @@ def __init__( The number of features in the hidden state `h` of the RNN module. num_layers The number of recurrent layers. - output_chunk_length - The number of steps to predict in the future. target_size The dimensionality of the output time series. nr_params @@ -69,6 +62,8 @@ def __init__( This network connects the last hidden layer of the PyTorch RNN module to the output. dropout The fraction of neurons that are dropped in all-but-last RNN layers. + **kwargs + all parameters required for :class:`darts.model.forecasting_models.PLForecastingModule` base class. Inputs ------ @@ -81,7 +76,10 @@ def __init__( Tensor containing the prediction at the last time step of the sequence. 
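For illustration, given an instance ``module`` of this class (the tensor sizes below are hypothetical):

        .. highlight:: python
        .. code-block:: python

            import torch

            # a batch of 32 training windows, input_chunk_length=24, input_size=3
            x = torch.randn(32, 24, 3)
            y = module(x)
            # y.shape == (32, output_chunk_length, target_size, nr_params)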
""" - super().__init__() + super().__init__(**kwargs) + + # required for all modules -> saves hparams for checkpoints + self.save_hyperparameters() # Defining parameters self.hidden_dim = hidden_dim @@ -89,7 +87,7 @@ def __init__( self.target_size = target_size self.nr_params = nr_params num_layers_out_fc = [] if num_layers_out_fc is None else num_layers_out_fc - self.out_len = output_chunk_length + self.out_len = self.output_chunk_length self.name = name # Defining the RNN module @@ -102,7 +100,7 @@ def __init__( last = hidden_dim feats = [] for feature in num_layers_out_fc + [ - output_chunk_length * target_size * nr_params + self.output_chunk_length * target_size * nr_params ]: feats.append(nn.Linear(last, feature)) last = feature @@ -128,10 +126,7 @@ def forward(self, x): return predictions -class BlockRNNModel( - TorchParametricProbabilisticForecastingModel, PastCovariatesTorchModel -): - @random_method +class BlockRNNModel(PastCovariatesTorchModel): def __init__( self, input_chunk_length: int, @@ -141,9 +136,7 @@ def __init__( n_rnn_layers: int = 1, hidden_fc_sizes: Optional[List] = None, dropout: float = 0.0, - likelihood: Optional[Likelihood] = None, - random_state: Optional[Union[int, RandomState]] = None, - **kwargs + **kwargs, ): """Block Recurrent Neural Network Model (RNNs). @@ -179,64 +172,43 @@ def __init__( Sizes of hidden layers connecting the last hidden layer of the RNN module to the output, if any. dropout Fraction of neurons afected by Dropout. - likelihood - Optionally, the likelihood model to be used for probabilistic forecasts. - If no likelihood model is provided, forecasts will be deterministic. - random_state - Control the randomness of the weights initialization. Check this - `link `_ for more details. - - batch_size - Number of time series (input and output sequences) used in each training pass. - n_epochs - Number of epochs over which to train the model. - add_encoders - A large number of past and future covariates can be automatically generated with `add_encoders`. - This can be done by adding mutliple pre-defined index encoders and/or custom user-made functions that - will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to - transform the generated covariates. This happens all under one hood and only needs to be specified at - model creation. - Read :meth:`SequentialEncoder ` to find out more about - `add_encoders`. An example showing some of `add_encoders` features: - - .. highlight:: python - .. code-block:: python + **kwargs + Optional arguments to initialize the pytorch_lightning.Module, pytorch_lightning.Trainer, and + Darts' :class:`TorchForecastingModel`. - add_encoders={ - 'cyclic': {'future': ['month']}, - 'datetime_attribute': {'future': ['hour', 'dayofweek']}, - 'position': {'past': ['absolute'], 'future': ['relative']}, - 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, - 'transformer': Scaler() - } - .. + loss_fn + PyTorch loss function used for training. + This parameter will be ignored for probabilistic models if the ``likelihood`` parameter is specified. + Default: ``torch.nn.MSELoss()``. + likelihood + The likelihood model to be used for probabilistic forecasts. optimizer_cls - The PyTorch optimizer class to be used (default: `torch.optim.Adam`). + The PyTorch optimizer class to be used (default: ``torch.optim.Adam``). optimizer_kwargs Optionally, some keyword arguments for the PyTorch optimizer (e.g., ``{'lr': 1e-3}`` - for specifying a learning rate). 
Otherwise the default values of the selected `optimizer_cls` + for specifying a learning rate). Otherwise the default values of the selected ``optimizer_cls`` will be used. lr_scheduler_cls - Optionally, the PyTorch learning rate scheduler class to be used. Specifying `None` corresponds + Optionally, the PyTorch learning rate scheduler class to be used. Specifying ``None`` corresponds to using a constant learning rate. lr_scheduler_kwargs - Optionally, some keyword arguments for the PyTorch optimizer. - loss_fn - PyTorch loss function used for training. - This parameter will be ignored for probabilistic models if the `likelihood` parameter is specified. - Default: ``torch.nn.MSELoss()``. + Optionally, some keyword arguments for the PyTorch learning rate scheduler. + batch_size + Number of time series (input and output sequences) used in each training pass. + n_epochs + Number of epochs over which to train the model. model_name Name of the model. Used for creating checkpoints and saving tensorboard data. If not specified, - defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part of - the name is formatted with the local date and time, while PID is the processed ID (preventing models spawned - at the same time by different processes to share the same model_name). E.g., + defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part + of the name is formatted with the local date and time, while PID is the processed ID (preventing models + spawned at the same time by different processes to share the same model_name). E.g., ``"2021-06-14_09:53:32_torch_model_run_44607"``. work_dir Path of the working directory, where to save checkpoints and Tensorboard summaries. (default: current working directory). log_tensorboard If set, use Tensorboard to log the different parameters. The logs will be located in: - `[work_dir]/.darts/runs/`. + ``"{work_dir}/darts_logs/{model_name}/logs/"``. nr_epochs_val_period Number of epochs to wait before evaluating the validation loss (if a validation ``TimeSeries`` is passed to the :func:`fit()` method). @@ -244,17 +216,79 @@ def __init__( Optionally, a string indicating the torch device to use. (default: "cuda:0" if a GPU is available, otherwise "cpu") force_reset - If set to `True`, any previously-existing model with the same name will be reset (all checkpoints will + If set to ``True``, any previously-existing model with the same name will be reset (all checkpoints will be discarded). save_checkpoints Whether or not to automatically save the untrained model and checkpoints from training. - If set to `False`, the model can still be manually saved using :func:`save_model()` - and loaded using :func:`load_model()`. + To load the model from checkpoint, call :func:`MyModelClass.load_from_checkpoint()`, where + :class:`MyModelClass` is the :class:`TorchForecastingModel` class that was used (such as :class:`TFTModel`, + :class:`NBEATSModel`, etc.). If set to ``False``, the model can still be manually saved using + :func:`save_model()` and loaded using :func:`load_model()`. + add_encoders + A large number of past and future covariates can be automatically generated with `add_encoders`. + This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that + will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to + transform the generated covariates. 
This happens all under one hood and only needs to be specified at + model creation. + Read :meth:`SequentialEncoder ` to find out more about + ``add_encoders``. An example showing some of ``add_encoders`` features: + + .. highlight:: python + .. code-block:: python + + add_encoders={ + 'cyclic': {'future': ['month']}, + 'datetime_attribute': {'future': ['hour', 'dayofweek']}, + 'position': {'past': ['absolute'], 'future': ['relative']}, + 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, + 'transformer': Scaler() + } + .. + random_state + Control the randomness of the weights initialization. Check this + `link `_ for more details. + pl_trainer_kwargs + By default :class:`TorchForecastingModel` creates a PyTorch Lightning Trainer with several useful presets + that performs the training, validation and prediction processes. These presets include automatic + checkpointing, tensorboard logging, setting the torch device and more. + With ``pl_trainer_kwargs`` you can add additional kwargs to instantiate the PyTorch Lightning trainer + object. Check the `PL Trainer documentation + `_ for more information about the + supported kwargs. + With parameter ``"callbacks"`` you can add custom or PyTorch-Lightning built-in callbacks to Darts' + :class:`TorchForecastingModel`. Below is an example for adding EarlyStopping to the training process. + The model will stop training early if the validation loss `val_loss` does not improve beyond + specifications. For more information on callbacks, visit: + `PyTorch Lightning Callbacks + `_ + + .. highlight:: python + .. code-block:: python + + from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + # stop training when validation loss does not decrease more than 0.05 (`min_delta`) over + # a period of 5 epochs (`patience`) + my_stopper = EarlyStopping( + monitor="val_loss", + patience=5, + min_delta=0.05, + mode='min', + ) + + pl_trainer_kwargs={"callbacks": [my_stopper]} + .. + + Note that you can also use a custom PyTorch Lightning Trainer for training and prediction with optional + parameter ``trainer`` in :func:`fit()` and :func:`predict()`. + show_warnings + whether to show warnings raised from PyTorch Lightning. Useful to detect potential issues of + your forecasting use case. 
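        A minimal usage sketch (``series`` is assumed to be a ``TimeSeries`` prepared beforehand; the
        hyper-parameter values are illustrative only):

        .. highlight:: python
        .. code-block:: python

            from darts.models import BlockRNNModel

            model = BlockRNNModel(
                input_chunk_length=24,
                output_chunk_length=12,
                model="LSTM",
                n_epochs=10,
            )
            model.fit(series)
            forecast = model.predict(n=12)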
""" + super().__init__(**self._extract_torch_model_params(**self.model_params)) - kwargs["input_chunk_length"] = input_chunk_length - kwargs["output_chunk_length"] = output_chunk_length - super().__init__(likelihood=likelihood, **kwargs) + # extract pytorch lightning module kwargs + self.pl_module_params = self._extract_pl_module_params(**self.model_params) # check we got right model type specified: if model not in ["RNN", "LSTM", "GRU"]: @@ -267,8 +301,6 @@ def __init__( logger, ) - self.input_chunk_length = input_chunk_length - self.output_chunk_length = output_chunk_length self.rnn_type_or_module = model self.hidden_fc_sizes = hidden_fc_sizes self.hidden_size = hidden_size @@ -294,18 +326,10 @@ def _create_model(self, train_sample: Tuple[torch.Tensor]) -> torch.nn.Module: nr_params=nr_params, hidden_dim=self.hidden_size, num_layers=self.n_rnn_layers, - output_chunk_length=self.output_chunk_length, num_layers_out_fc=hidden_fc_sizes, dropout=self.dropout, + **self.pl_module_params, ) else: model = self.rnn_type_or_module return model - - @random_method - def _produce_predict_output(self, x): - if self.likelihood: - output = self.model(x) - return self.likelihood.sample(output) - else: - return self.model(x).squeeze(dim=-1) diff --git a/darts/models/forecasting/forecasting_model.py b/darts/models/forecasting/forecasting_model.py index 5afed3d20a..d0346f2034 100644 --- a/darts/models/forecasting/forecasting_model.py +++ b/darts/models/forecasting/forecasting_model.py @@ -15,6 +15,7 @@ import inspect import time from abc import ABC, ABCMeta, abstractmethod +from collections import OrderedDict from itertools import product from random import sample from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union @@ -31,15 +32,64 @@ _parallel_apply, _with_sanity_checks, ) -from darts.utils.timeseries_generation import _generate_index +from darts.utils.timeseries_generation import ( + _build_forecast_series, + _generate_new_dates, +) logger = get_logger(__name__) class ModelMeta(ABCMeta): + """Meta class to store parameters used at model creation. + + When creating a model instance, the parameters are extracted as follows: + + 1) Get the model's __init__ signature and store all arg and kwarg + names as well as default values (empty for args) in an ordered + dict `all_params`. + 2) Replace the arg values from `all_params` with the positional + args used at model creation. + 3) Remove args from `all_params` that were not passed as positional + args at model creation. This will enforce that an error is raised + if not all positional args were passed. If all positional args + were passed, no parameter will be removed. + 4) Update `all_params` kwargs with optional kwargs from model creation. + 5) Save `all_params` to the model. + 6) Call (create) the model with `all_params`. 
+ """ + def __call__(cls, *args, **kwargs): - cls.model_call = (args, kwargs) - return super().__call__(*args, **kwargs) + # 1) get all default values from class' __init__ signature + sig = inspect.signature(cls.__init__) + all_params = OrderedDict( + [ + (p.name, p.default) + for p in sig.parameters.values() + if not p.name == "self" + ] + ) + + # 2) fill params with positional args + for param, arg in zip(all_params, args): + all_params[param] = arg + + # 3) remove args which were not set (and are per default empty) + remove_params = [] + for param, val in all_params.items(): + if val is sig.parameters[param].empty: + remove_params.append(param) + for param in remove_params: + all_params.pop(param) + + # 4) update defaults with actual model call parameters and store + all_params.update(kwargs) + + # 5) save parameters in model + cls._model_call = all_params + + # 6) call model + return super().__call__(**all_params) class ForecastingModel(ABC, metaclass=ModelMeta): @@ -191,14 +241,7 @@ def _generate_new_dates( input_series = ( input_series if input_series is not None else self.training_series ) - - last = input_series.end_time() - start = ( - last + input_series.freq if input_series.has_datetime_index else last + 1 - ) - return _generate_index( - start=start, freq=input_series.freq, length=n, name=input_series.time_dim - ) + return _generate_new_dates(n=n, input_series=input_series) def _build_forecast_series( self, @@ -212,28 +255,7 @@ def _build_forecast_series( input_series = ( input_series if input_series is not None else self.training_series ) - time_index_length = ( - len(points_preds) - if isinstance(points_preds, np.ndarray) - else len(points_preds[0]) - ) - time_index = self._generate_new_dates( - time_index_length, input_series=input_series - ) - if isinstance(points_preds, np.ndarray): - return TimeSeries.from_times_and_values( - time_index, - points_preds, - freq=input_series.freq_str, - columns=input_series.columns, - ) - - return TimeSeries.from_times_and_values( - time_index, - np.stack(points_preds, axis=2), - freq=input_series.freq_str, - columns=input_series.columns, - ) + return _build_forecast_series(points_preds, input_series) def _historical_forecasts_sanity_checks(self, *args: Any, **kwargs: Any) -> None: """Sanity checks for the historical_forecasts function @@ -817,13 +839,18 @@ def _sample_params(model_class, params, n_random_samples): def _extract_model_creation_params(self): """extracts immutable model creation parameters from `ModelMeta` and deletes reference.""" - model_params = copy.deepcopy(self.model_call) - del self.__class__.model_call + model_params = copy.deepcopy(self._model_call) + del self.__class__._model_call return model_params def untrained_model(self): - args, kwargs = self._model_params - return self.__class__(*args, **kwargs) + return self.__class__(**self.model_params) + + @property + def model_params(self) -> dict: + return ( + self._model_params if hasattr(self, "_model_params") else self._model_call + ) class GlobalForecastingModel(ForecastingModel, ABC): diff --git a/darts/models/forecasting/nbeats.py b/darts/models/forecasting/nbeats.py index fea63637a9..c3bf925263 100644 --- a/darts/models/forecasting/nbeats.py +++ b/darts/models/forecasting/nbeats.py @@ -4,20 +4,15 @@ """ from enum import Enum -from typing import List, NewType, Optional, Tuple, Union +from typing import List, NewType, Tuple, Union import numpy as np import torch import torch.nn as nn -from numpy.random import RandomState from darts.logging import get_logger, 
raise_if_not, raise_log -from darts.models.forecasting.torch_forecasting_model import ( - PastCovariatesTorchModel, - TorchParametricProbabilisticForecastingModel, -) -from darts.utils.likelihood_models import Likelihood -from darts.utils.torch import random_method +from darts.models.forecasting.pl_forecasting_module import PLPastCovariatesModule +from darts.models.forecasting.torch_forecasting_model import PastCovariatesTorchModel logger = get_logger(__name__) @@ -302,14 +297,12 @@ def forward(self, x): return stack_residual, stack_forecast -class _NBEATSModule(nn.Module): +class _NBEATSModule(PLPastCovariatesModule): def __init__( self, input_dim: int, output_dim: int, nr_params: int, - input_chunk_length: int, - output_chunk_length: int, generic_architecture: bool, num_stacks: int, num_blocks: int, @@ -317,6 +310,7 @@ def __init__( layer_widths: List[int], expansion_coefficient_dim: int, trend_polynomial_degree: int, + **kwargs ): """PyTorch module implementing the N-BEATS architecture. @@ -326,10 +320,6 @@ def __init__( Number of output components in the target nr_params The number of parameters of the likelihood (or 1 if no likelihood is used). - input_chunk_length - The length of the input sequence fed to the model. - output_chunk_length - The length of the forecast of the model. generic_architecture Boolean value indicating whether the generic architecture of N-BEATS is used. If not, the interpretable architecture outlined in the paper (consisting of one trend @@ -352,6 +342,8 @@ def __init__( trend_polynomial_degree The degree of the polynomial used as waveform generator in trend stacks. Only used if `generic_architecture` is set to `False`. + **kwargs + all parameters required for :class:`darts.model.forecasting_models.PLForecastingModule` base class. Inputs ------ @@ -364,14 +356,16 @@ def __init__( Tensor containing the output of the NBEATS module. """ - super().__init__() + super().__init__(**kwargs) + + # required for all modules -> saves hparams for checkpoints + self.save_hyperparameters() self.input_dim = input_dim self.output_dim = output_dim self.nr_params = nr_params - self.input_chunk_length_multi = input_chunk_length * input_dim - self.output_chunk_length = output_chunk_length - self.target_length = output_chunk_length * input_dim + self.input_chunk_length_multi = self.input_chunk_length * input_dim + self.target_length = self.output_chunk_length * input_dim if generic_architecture: self.stacks_list = [ @@ -457,10 +451,7 @@ def forward(self, x): return y -class NBEATSModel( - TorchParametricProbabilisticForecastingModel, PastCovariatesTorchModel -): - @random_method +class NBEATSModel(PastCovariatesTorchModel): def __init__( self, input_chunk_length: int, @@ -472,8 +463,6 @@ def __init__( layer_widths: Union[int, List[int]] = 256, expansion_coefficient_dim: int = 5, trend_polynomial_degree: int = 2, - likelihood: Optional[Likelihood] = None, - random_state: Optional[Union[int, RandomState]] = None, **kwargs ): """Neural Basis Expansion Analysis Time Series Forecasting (N-BEATS). @@ -516,64 +505,43 @@ def __init__( trend_polynomial_degree The degree of the polynomial used as waveform generator in trend stacks. Only used if `generic_architecture` is set to `False`. - likelihood - Optionally, the likelihood model to be used for probabilistic forecasts. - If no likelihood model is provided, forecasts will be deterministic. - random_state - Control the randomness of the weights initialization. Check this - `link `_ for more details. 
- - batch_size - Number of time series (input and output sequences) used in each training pass. - n_epochs - Number of epochs over which to train the model. - add_encoders - A large number of past and future covariates can be automatically generated with `add_encoders`. - This can be done by adding mutliple pre-defined index encoders and/or custom user-made functions that - will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to - transform the generated covariates. This happens all under one hood and only needs to be specified at - model creation. - Read :meth:`SequentialEncoder ` to find out more about - `add_encoders`. An example showing some of `add_encoders` features: - - .. highlight:: python - .. code-block:: python + **kwargs + Optional arguments to initialize the pytorch_lightning.Module, pytorch_lightning.Trainer, and + Darts' :class:`TorchForecastingModel`. - add_encoders={ - 'cyclic': {'future': ['month']}, - 'datetime_attribute': {'future': ['hour', 'dayofweek']}, - 'position': {'past': ['absolute'], 'future': ['relative']}, - 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, - 'transformer': Scaler() - } - .. + loss_fn + PyTorch loss function used for training. + This parameter will be ignored for probabilistic models if the ``likelihood`` parameter is specified. + Default: ``torch.nn.MSELoss()``. + likelihood + The likelihood model to be used for probabilistic forecasts. optimizer_cls - The PyTorch optimizer class to be used (default: `torch.optim.Adam`). + The PyTorch optimizer class to be used (default: ``torch.optim.Adam``). optimizer_kwargs Optionally, some keyword arguments for the PyTorch optimizer (e.g., ``{'lr': 1e-3}`` - for specifying a learning rate). Otherwise the default values of the selected `optimizer_cls` + for specifying a learning rate). Otherwise the default values of the selected ``optimizer_cls`` will be used. lr_scheduler_cls - Optionally, the PyTorch learning rate scheduler class to be used. Specifying `None` corresponds + Optionally, the PyTorch learning rate scheduler class to be used. Specifying ``None`` corresponds to using a constant learning rate. lr_scheduler_kwargs - Optionally, some keyword arguments for the PyTorch optimizer. - loss_fn - PyTorch loss function used for training. - This parameter will be ignored for probabilistic models if the `likelihood` parameter is specified. - Default: ``torch.nn.MSELoss()``. + Optionally, some keyword arguments for the PyTorch learning rate scheduler. + batch_size + Number of time series (input and output sequences) used in each training pass. + n_epochs + Number of epochs over which to train the model. model_name Name of the model. Used for creating checkpoints and saving tensorboard data. If not specified, - defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part of - the name is formatted with the local date and time, while PID is the processed ID (preventing models spawned - at the same time by different processes to share the same model_name). E.g., + defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part + of the name is formatted with the local date and time, while PID is the processed ID (preventing models + spawned at the same time by different processes to share the same model_name). E.g., ``"2021-06-14_09:53:32_torch_model_run_44607"``. work_dir Path of the working directory, where to save checkpoints and Tensorboard summaries. 
(default: current working directory). log_tensorboard If set, use Tensorboard to log the different parameters. The logs will be located in: - `[work_dir]/.darts/runs/`. + ``"{work_dir}/darts_logs/{model_name}/logs/"``. nr_epochs_val_period Number of epochs to wait before evaluating the validation loss (if a validation ``TimeSeries`` is passed to the :func:`fit()` method). @@ -581,21 +549,83 @@ def __init__( Optionally, a string indicating the torch device to use. (default: "cuda:0" if a GPU is available, otherwise "cpu") force_reset - If set to `True`, any previously-existing model with the same name will be reset (all checkpoints will + If set to ``True``, any previously-existing model with the same name will be reset (all checkpoints will be discarded). save_checkpoints Whether or not to automatically save the untrained model and checkpoints from training. - If set to `False`, the model can still be manually saved using :func:`save_model()` - and loaded using :func:`load_model()`. + To load the model from checkpoint, call :func:`MyModelClass.load_from_checkpoint()`, where + :class:`MyModelClass` is the :class:`TorchForecastingModel` class that was used (such as :class:`TFTModel`, + :class:`NBEATSModel`, etc.). If set to ``False``, the model can still be manually saved using + :func:`save_model()` and loaded using :func:`load_model()`. + add_encoders + A large number of past and future covariates can be automatically generated with `add_encoders`. + This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that + will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to + transform the generated covariates. This happens all under one hood and only needs to be specified at + model creation. + Read :meth:`SequentialEncoder ` to find out more about + ``add_encoders``. An example showing some of ``add_encoders`` features: + + .. highlight:: python + .. code-block:: python + + add_encoders={ + 'cyclic': {'future': ['month']}, + 'datetime_attribute': {'future': ['hour', 'dayofweek']}, + 'position': {'past': ['absolute'], 'future': ['relative']}, + 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, + 'transformer': Scaler() + } + .. + random_state + Control the randomness of the weights initialization. Check this + `link `_ for more details. + pl_trainer_kwargs + By default :class:`TorchForecastingModel` creates a PyTorch Lightning Trainer with several useful presets + that performs the training, validation and prediction processes. These presets include automatic + checkpointing, tensorboard logging, setting the torch device and more. + With ``pl_trainer_kwargs`` you can add additional kwargs to instantiate the PyTorch Lightning trainer + object. Check the `PL Trainer documentation + `_ for more information about the + supported kwargs. + With parameter ``"callbacks"`` you can add custom or PyTorch-Lightning built-in callbacks to Darts' + :class:`TorchForecastingModel`. Below is an example for adding EarlyStopping to the training process. + The model will stop training early if the validation loss `val_loss` does not improve beyond + specifications. For more information on callbacks, visit: + `PyTorch Lightning Callbacks + `_ + + .. highlight:: python + .. 
code-block:: python + + from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + # stop training when validation loss does not decrease more than 0.05 (`min_delta`) over + # a period of 5 epochs (`patience`) + my_stopper = EarlyStopping( + monitor="val_loss", + patience=5, + min_delta=0.05, + mode='min', + ) + + pl_trainer_kwargs={"callbacks": [my_stopper]} + .. + + Note that you can also use a custom PyTorch Lightning Trainer for training and prediction with optional + parameter ``trainer`` in :func:`fit()` and :func:`predict()`. + show_warnings + whether to show warnings raised from PyTorch Lightning. Useful to detect potential issues of + your forecasting use case. References ---------- .. [1] https://openreview.net/forum?id=r1ecqn4YwB """ + super().__init__(**self._extract_torch_model_params(**self.model_params)) - kwargs["input_chunk_length"] = input_chunk_length - kwargs["output_chunk_length"] = output_chunk_length - super().__init__(likelihood=likelihood, **kwargs) + # extract pytorch lightning module kwargs + self.pl_module_params = self._extract_pl_module_params(**self.model_params) raise_if_not( isinstance(layer_widths, int) or len(layer_widths) == num_stacks, @@ -604,8 +634,6 @@ def __init__( logger, ) - self.input_chunk_length = input_chunk_length - self.output_chunk_length = output_chunk_length self.generic_architecture = generic_architecture self.num_stacks = num_stacks self.num_blocks = num_blocks @@ -632,8 +660,6 @@ def _create_model(self, train_sample: Tuple[torch.Tensor]) -> torch.nn.Module: input_dim=input_dim, output_dim=output_dim, nr_params=nr_params, - input_chunk_length=self.input_chunk_length, - output_chunk_length=self.output_chunk_length, generic_architecture=self.generic_architecture, num_stacks=self.num_stacks, num_blocks=self.num_blocks, @@ -641,12 +667,5 @@ def _create_model(self, train_sample: Tuple[torch.Tensor]) -> torch.nn.Module: layer_widths=self.layer_widths, expansion_coefficient_dim=self.expansion_coefficient_dim, trend_polynomial_degree=self.trend_polynomial_degree, + **self.pl_module_params, ) - - @random_method - def _produce_predict_output(self, x): - if self.likelihood: - output = self.model(x) - return self.likelihood.sample(output) - else: - return self.model(x).squeeze(dim=-1) diff --git a/darts/models/forecasting/pl_forecasting_module.py b/darts/models/forecasting/pl_forecasting_module.py new file mode 100644 index 0000000000..a87c8a96e4 --- /dev/null +++ b/darts/models/forecasting/pl_forecasting_module.py @@ -0,0 +1,640 @@ +""" +This file contains abstract classes for deterministic and probabilistic PyTorch Lightning Modules +""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, Optional, Sequence, Tuple + +import pytorch_lightning as pl +import torch +import torch.nn as nn +from joblib import delayed, Parallel + +from darts.logging import get_logger, raise_if, raise_log +from darts.timeseries import TimeSeries +from darts.utils.likelihood_models import Likelihood +from darts.utils.timeseries_generation import _build_forecast_series + + +logger = get_logger(__name__) + + +class PLForecastingModule(pl.LightningModule, ABC): + @abstractmethod + def __init__( + self, + input_chunk_length: int, + output_chunk_length: int, + loss_fn: nn.modules.loss._Loss = nn.MSELoss(), + likelihood: Optional[Likelihood] = None, + optimizer_cls: torch.optim.Optimizer = torch.optim.Adam, + optimizer_kwargs: Optional[Dict] = None, + lr_scheduler_cls: torch.optim.lr_scheduler._LRScheduler = None, + lr_scheduler_kwargs: 
Optional[Dict] = None, + ) -> None: + """ + PyTorch Lightning-based Forecasting Module. + + This class is meant to be inherited to create a new PyTorch Lightning-based forecasting module. + When subclassing this class, please make sure to add the following methods with the given signatures: + - :func:`PLTorchForecastingModel.__init__()` + - :func:`PLTorchForecastingModel.forward()` + - :func:`PLTorchForecastingModel._produce_train_output()` + - :func:`PLTorchForecastingModel._get_batch_prediction()` + + In subclass `MyModel`'s :func:`__init__` function call ``super(MyModel, self).__init__(**kwargs)`` where + ``kwargs`` are the parameters of :class:`PLTorchForecastingModel`. + + Parameters + ---------- + input_chunk_length + Number of input past time steps per chunk. + output_chunk_length + Number of output time steps per chunk. + loss_fn + PyTorch loss function used for training. + This parameter will be ignored for probabilistic models if the ``likelihood`` parameter is specified. + Default: ``torch.nn.MSELoss()``. + likelihood + The likelihood model to be used for probabilistic forecasts. + optimizer_cls + The PyTorch optimizer class to be used (default: ``torch.optim.Adam``). + optimizer_kwargs + Optionally, some keyword arguments for the PyTorch optimizer (e.g., ``{'lr': 1e-3}`` + for specifying a learning rate). Otherwise the default values of the selected ``optimizer_cls`` + will be used. + lr_scheduler_cls + Optionally, the PyTorch learning rate scheduler class to be used. Specifying ``None`` corresponds + to using a constant learning rate. + lr_scheduler_kwargs + Optionally, some keyword arguments for the PyTorch learning rate scheduler. + """ + super().__init__() + + raise_if( + input_chunk_length is None or output_chunk_length is None, + "Both `input_chunk_length` and `output_chunk_length` must be passed to `PLForecastingModule`", + logger, + ) + + self.input_chunk_length = input_chunk_length + self.output_chunk_length = output_chunk_length + + # define the loss function + self.criterion = loss_fn + # by default models are deterministic (i.e. not probabilistic) + self.likelihood = likelihood + + # persist optimiser and LR scheduler parameters + self.optimizer_cls = optimizer_cls + self.optimizer_kwargs = dict() if optimizer_kwargs is None else optimizer_kwargs + self.lr_scheduler_cls = lr_scheduler_cls + self.lr_scheduler_kwargs = ( + dict() if lr_scheduler_kwargs is None else lr_scheduler_kwargs + ) + + # initialize prediction parameters + self.pred_n: Optional[int] = None + self.pred_num_samples: Optional[int] = None + self.pred_roll_size: Optional[int] = None + self.pred_batch_size: Optional[int] = None + self.pred_n_jobs: Optional[int] = None + + @property + def first_prediction_index(self) -> int: + """ + Returns the index of the first predicted within the output of self.model. 
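        As a side note on the optimizer- and scheduler-related parameters documented above, a sketch of how
        they are typically supplied when creating a Darts model (the chosen model, classes and values are
        purely illustrative):

        .. highlight:: python
        .. code-block:: python

            from torch.optim import SGD
            from torch.optim.lr_scheduler import ExponentialLR

            from darts.models import NBEATSModel

            model = NBEATSModel(
                input_chunk_length=24,
                output_chunk_length=12,
                optimizer_cls=SGD,
                optimizer_kwargs={"lr": 1e-2, "momentum": 0.9},
                lr_scheduler_cls=ExponentialLR,
                lr_scheduler_kwargs={"gamma": 0.98},
            )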
+ """ + return 0 + + @abstractmethod + def forward(self, *args, **kwargs) -> Any: + super().forward(*args, **kwargs) + + def training_step(self, train_batch, batch_idx) -> torch.Tensor: + """performs the training step""" + output = self._produce_train_output(train_batch[:-1]) + target = train_batch[ + -1 + ] # By convention target is always the last element returned by datasets + loss = self._compute_loss(output, target) + self.log("train_loss", loss, batch_size=train_batch[0].shape[0]) + return loss + + def validation_step(self, val_batch, batch_idx) -> torch.Tensor: + """performs the validation step""" + output = self._produce_train_output(val_batch[:-1]) + target = val_batch[-1] + loss = self._compute_loss(output, target) + self.log("val_loss", loss, batch_size=val_batch[0].shape[0]) + return loss + + def predict_step( + self, batch: Tuple, batch_idx: int, dataloader_idx: Optional[int] = None + ) -> Sequence[TimeSeries]: + """performs the prediction step + + batch + output of Darts' :class:`InferenceDataset` - tuple of ``(past_target, past_covariates, + historic_future_covariates, future_covariates, future_past_covariates, input_timeseries)`` + batch_idx + the batch index of the current batch + dataloader_idx + the dataloader index + """ + input_data_tuple, batch_input_series = batch[:-1], batch[-1] + + # number of individual series to be predicted in current batch + num_series = input_data_tuple[0].shape[0] + + # number of of times the input tensor should be tiled to produce predictions for multiple samples + # this variable is larger than 1 only if the batch_size is at least twice as large as the number + # of individual time series being predicted in current batch (`num_series`) + batch_sample_size = min( + max(self.pred_batch_size // num_series, 1), self.pred_num_samples + ) + + # counts number of produced prediction samples for every series to be predicted in current batch + sample_count = 0 + + # repeat prediction procedure for every needed sample + batch_predictions = [] + while sample_count < self.pred_num_samples: + + # make sure we don't produce too many samples + if sample_count + batch_sample_size > self.pred_num_samples: + batch_sample_size = self.pred_num_samples - sample_count + + # stack multiple copies of the tensors to produce probabilistic forecasts + input_data_tuple_samples = self._sample_tiling( + input_data_tuple, batch_sample_size + ) + + # get predictions for 1 whole batch (can include predictions of multiple series + # and for multiple samples if a probabilistic forecast is produced) + batch_prediction = self._get_batch_prediction( + self.pred_n, input_data_tuple_samples, self.pred_roll_size + ) + + # reshape from 3d tensor (num_series x batch_sample_size, ...) 
+ # into 4d tensor (batch_sample_size, num_series, ...), where dim 0 represents the samples + out_shape = batch_prediction.shape + batch_prediction = batch_prediction.reshape( + ( + batch_sample_size, + num_series, + ) + + out_shape[1:] + ) + + # save all predictions and update the `sample_count` variable + batch_predictions.append(batch_prediction) + sample_count += batch_sample_size + + # concatenate the batch of samples, to form self.pred_num_samples samples + batch_predictions = torch.cat(batch_predictions, dim=0) + batch_predictions = batch_predictions.cpu().detach().numpy() + + ts_forecasts = Parallel(n_jobs=self.pred_n_jobs)( + delayed(_build_forecast_series)( + [batch_prediction[batch_idx] for batch_prediction in batch_predictions], + input_series, + ) + for batch_idx, input_series in enumerate(batch_input_series) + ) + return ts_forecasts + + def set_predict_parameters( + self, n: int, num_samples: int, roll_size: int, batch_size: int, n_jobs: int + ) -> None: + """to be set from TorchForecastingModel before calling trainer.predict() and reset at self.on_predict_end()""" + self.pred_n = n + self.pred_num_samples = num_samples + self.pred_roll_size = roll_size + self.pred_batch_size = batch_size + self.pred_n_jobs = n_jobs + + def _compute_loss(self, output, target): + # output is of shape (batch_size, n_timesteps, n_components, n_params) + if self.likelihood: + return self.likelihood.compute_loss(output, target) + else: + # If there's no likelihood, nr_params=1 and we need to squeeze out the + # last dimension of model output, for properly computing the loss. + return self.criterion(output.squeeze(dim=-1), target) + + def configure_optimizers(self): + """configures optimizers and learning rate schedulers for for model optimization.""" + + # A utility function to create optimizer and lr scheduler from desired classes + def _create_from_cls_and_kwargs(cls, kws): + try: + return cls(**kws) + except (TypeError, ValueError) as e: + raise_log( + ValueError( + "Error when building the optimizer or learning rate scheduler;" + "please check the provided class and arguments" + "\nclass: {}" + "\narguments (kwargs): {}" + "\nerror:\n{}".format(cls, kws, e) + ), + logger, + ) + + # Create the optimizer and (optionally) the learning rate scheduler + # we have to create copies because we cannot save model.parameters into object state (not serializable) + optimizer_kws = {k: v for k, v in self.optimizer_kwargs.items()} + optimizer_kws["params"] = self.parameters() + + optimizer = _create_from_cls_and_kwargs(self.optimizer_cls, optimizer_kws) + + if self.lr_scheduler_cls is not None: + lr_sched_kws = {k: v for k, v in self.lr_scheduler_kwargs.items()} + lr_sched_kws["optimizer"] = optimizer + lr_scheduler = _create_from_cls_and_kwargs( + self.lr_scheduler_cls, lr_sched_kws + ) + return [optimizer], [lr_scheduler] + else: + return optimizer + + @abstractmethod + def _produce_train_output(self, input_batch: Tuple) -> torch.Tensor: + pass + + @abstractmethod + def _get_batch_prediction( + self, n: int, input_batch: Tuple, roll_size: int + ) -> torch.Tensor: + """ + In charge of applying the recurrent logic for non-recurrent models. + Should be overwritten by recurrent models. 
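        Together with :func:`__init__` and :func:`forward`, these abstract methods form the contract
        implemented by the concrete modules in this diff; subclasses of :class:`PLPastCovariatesModule`
        inherit both of them and only need ``__init__`` and ``forward``. A rough, illustrative skeleton
        (``_MyModule`` and its single linear layer are hypothetical):

        .. highlight:: python
        .. code-block:: python

            import torch.nn as nn

            from darts.models.forecasting.pl_forecasting_module import PLPastCovariatesModule


            class _MyModule(PLPastCovariatesModule):
                def __init__(self, input_dim: int, output_dim: int, nr_params: int, **kwargs):
                    # kwargs carry the PLForecastingModule parameters (chunk lengths, loss, likelihood, ...)
                    super().__init__(**kwargs)
                    self.save_hyperparameters()
                    self.output_dim = output_dim
                    self.nr_params = nr_params
                    self.linear = nn.Linear(
                        self.input_chunk_length * input_dim,
                        self.output_chunk_length * output_dim * nr_params,
                    )

                def forward(self, x):
                    # x has shape (batch, input_chunk_length, input_dim)
                    batch_size = x.shape[0]
                    y = self.linear(x.reshape(batch_size, -1))
                    # Darts modules return (batch, output_chunk_length, output_dim, nr_params)
                    return y.view(
                        batch_size, self.output_chunk_length, self.output_dim, self.nr_params
                    )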
+ """ + pass + + @staticmethod + def _sample_tiling(input_data_tuple, batch_sample_size): + tiled_input_data = [] + for tensor in input_data_tuple: + if tensor is not None: + tiled_input_data.append(tensor.tile((batch_sample_size, 1, 1))) + else: + tiled_input_data.append(None) + return tuple(tiled_input_data) + + def _is_probabilistic(self) -> bool: + return self.likelihood is not None + + def _produce_predict_output(self, x): + if self.likelihood: + output = self(x) + return self.likelihood.sample(output) + else: + return self(x).squeeze(dim=-1) + + def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + # we must save the dtype for correct parameter precision at loading time + checkpoint["model_dtype"] = self.dtype + + def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + # by default our models are initialized as float32. For other dtypes, we need to cast to the correct precision + # before parameters are loaded by PyTorch-Lightning + dtype = checkpoint["model_dtype"] + if dtype == torch.float16: + self.half() + if dtype == torch.float32: + self.float() + elif dtype == torch.float64: + self.double() + else: + raise_if( + True, + f"Trying to load dtype {dtype}. Loading for this type is not implemented yet. Please report this " + f"issue on https://github.com/unit8co/darts", + logger, + ) + + @property + def epochs_trained(self): + # trained epochs are only 0 when global step and current epoch are 0, else current epoch + 1 + current_epoch = self.current_epoch + if self.current_epoch or self.global_step: + current_epoch += 1 + return current_epoch + + +class PLPastCovariatesModule(PLForecastingModule, ABC): + def _produce_train_output(self, input_batch: Tuple): + past_target, past_covariate = input_batch + # Currently all our PastCovariates models require past target and covariates concatenated + inpt = ( + torch.cat([past_target, past_covariate], dim=2) + if past_covariate is not None + else past_target + ) + return self(inpt) + + def _get_batch_prediction( + self, n: int, input_batch: Tuple, roll_size: int + ) -> torch.Tensor: + """ + Feeds PastCovariatesTorchModel with input and output chunks of a PastCovariatesSequentialDataset to farecast + the next ``n`` target values per target variable. + + Parameters: + ---------- + n + prediction length + input_batch + (past_target, past_covariates, future_past_covariates) + roll_size + roll input arrays after every sequence by ``roll_size``. Initially, ``roll_size`` is equivalent to + ``self.output_chunk_length`` + """ + dim_component = 2 + past_target, past_covariates, future_past_covariates = input_batch + + n_targets = past_target.shape[dim_component] + n_past_covs = ( + past_covariates.shape[dim_component] if past_covariates is not None else 0 + ) + + input_past = torch.cat( + [ds for ds in [past_target, past_covariates] if ds is not None], + dim=dim_component, + ) + + out = self._produce_predict_output(input_past)[ + :, self.first_prediction_index :, : + ] + + batch_prediction = [out[:, :roll_size, :]] + prediction_length = roll_size + + while prediction_length < n: + # we want the last prediction to end exactly at `n` into the future. 
+ # this means we may have to truncate the previous prediction and step + # back the roll size for the last chunk + if prediction_length + self.output_chunk_length > n: + spillover_prediction_length = ( + prediction_length + self.output_chunk_length - n + ) + roll_size -= spillover_prediction_length + prediction_length -= spillover_prediction_length + batch_prediction[-1] = batch_prediction[-1][:, :roll_size, :] + + # ==========> PAST INPUT <========== + # roll over input series to contain latest target and covariate + input_past = torch.roll(input_past, -roll_size, 1) + + # update target input to include next `roll_size` predictions + if self.input_chunk_length >= roll_size: + input_past[:, -roll_size:, :n_targets] = out[:, :roll_size, :] + else: + input_past[:, :, :n_targets] = out[:, -self.input_chunk_length :, :] + + # set left and right boundaries for extracting future elements + if self.input_chunk_length >= roll_size: + left_past, right_past = prediction_length - roll_size, prediction_length + else: + left_past, right_past = ( + prediction_length - self.input_chunk_length, + prediction_length, + ) + + # update past covariates to include next `roll_size` future past covariates elements + if n_past_covs and self.input_chunk_length >= roll_size: + input_past[ + :, -roll_size:, n_targets : n_targets + n_past_covs + ] = future_past_covariates[:, left_past:right_past, :] + elif n_past_covs: + input_past[ + :, :, n_targets : n_targets + n_past_covs + ] = future_past_covariates[:, left_past:right_past, :] + + # take only last part of the output sequence where needed + out = self._produce_predict_output(input_past)[ + :, self.first_prediction_index :, : + ] + batch_prediction.append(out) + prediction_length += self.output_chunk_length + + # bring predictions into desired format and drop unnecessary values + batch_prediction = torch.cat(batch_prediction, dim=1) + batch_prediction = batch_prediction[:, :n, :] + return batch_prediction + + +class PLFutureCovariatesModule(PLForecastingModule, ABC): + def _get_batch_prediction( + self, n: int, input_batch: Tuple, roll_size: int + ) -> torch.Tensor: + raise NotImplementedError("TBD: Darts doesn't contain such a model yet.") + + +class PLDualCovariatesModule(PLForecastingModule, ABC): + def _get_batch_prediction( + self, n: int, input_batch: Tuple, roll_size: int + ) -> torch.Tensor: + raise NotImplementedError( + "TBD: The only DualCovariatesModel is an RNN with a specific implementation." + ) + + +class PLMixedCovariatesModule(PLForecastingModule, ABC): + def _produce_train_output( + self, input_batch: Tuple + ) -> Tuple[torch.Tensor, torch.Tensor]: + return self(self._process_input_batch(input_batch)) + + def _process_input_batch( + self, input_batch + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Converts output of MixedCovariatesDataset (training dataset) into an input/past- and + output/future chunk. + + Parameters + ---------- + input_batch + ``(past_target, past_covariates, historic_future_covariates, future_covariates)``. + + Returns + ------- + tuple + ``(x_past, x_future)`` the input/past and output/future chunks. 
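        A shape-level illustration, given an instance ``module`` of this class (the component counts are
        hypothetical; dim 2 is the component axis):

        .. highlight:: python
        .. code-block:: python

            # past_target:                (batch, input_chunk_length, 2)
            # past_covariates:            (batch, input_chunk_length, 1)
            # historic_future_covariates: (batch, input_chunk_length, 3)
            # future_covariates:          (batch, output_chunk_length, 3)
            x_past, x_future = module._process_input_batch(
                (past_target, past_covariates, historic_future_covariates, future_covariates)
            )
            # x_past:   (batch, input_chunk_length, 2 + 1 + 3)
            # x_future: (batch, output_chunk_length, 3)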
+ """ + + ( + past_target, + past_covariates, + historic_future_covariates, + future_covariates, + ) = input_batch + dim_variable = 2 + + # TODO: impelement static covariates + static_covariates = None + + x_past = torch.cat( + [ + tensor + for tensor in [ + past_target, + past_covariates, + historic_future_covariates, + static_covariates, + ] + if tensor is not None + ], + dim=dim_variable, + ) + + x_future = None + if future_covariates is not None or static_covariates is not None: + x_future = torch.cat( + [ + tensor + for tensor in [future_covariates, static_covariates] + if tensor is not None + ], + dim=dim_variable, + ) + + return x_past, x_future + + def _get_batch_prediction( + self, n: int, input_batch: Tuple, roll_size: int + ) -> torch.Tensor: + """ + Feeds MixedCovariatesModel with input and output chunks of a MixedCovariatesSequentialDataset to farecast + the next ``n`` target values per target variable. + + Parameters + ---------- + n + prediction length + input_batch + (past_target, past_covariates, historic_future_covariates, future_covariates, future_past_covariates) + roll_size + roll input arrays after every sequence by ``roll_size``. Initially, ``roll_size`` is equivalent to + ``self.output_chunk_length`` + """ + + dim_component = 2 + ( + past_target, + past_covariates, + historic_future_covariates, + future_covariates, + future_past_covariates, + ) = input_batch + + n_targets = past_target.shape[dim_component] + n_past_covs = ( + past_covariates.shape[dim_component] if past_covariates is not None else 0 + ) + n_future_covs = ( + future_covariates.shape[dim_component] + if future_covariates is not None + else 0 + ) + + input_past, input_future = self._process_input_batch( + ( + past_target, + past_covariates, + historic_future_covariates, + future_covariates[:, :roll_size, :] + if future_covariates is not None + else None, + ) + ) + + out = self._produce_predict_output(x=(input_past, input_future))[ + :, self.first_prediction_index :, : + ] + + batch_prediction = [out[:, :roll_size, :]] + prediction_length = roll_size + + while prediction_length < n: + # we want the last prediction to end exactly at `n` into the future. 
+ # this means we may have to truncate the previous prediction and step + # back the roll size for the last chunk + if prediction_length + self.output_chunk_length > n: + spillover_prediction_length = ( + prediction_length + self.output_chunk_length - n + ) + roll_size -= spillover_prediction_length + prediction_length -= spillover_prediction_length + batch_prediction[-1] = batch_prediction[-1][:, :roll_size, :] + + # ==========> PAST INPUT <========== + # roll over input series to contain latest target and covariate + input_past = torch.roll(input_past, -roll_size, 1) + + # update target input to include next `roll_size` predictions + if self.input_chunk_length >= roll_size: + input_past[:, -roll_size:, :n_targets] = out[:, :roll_size, :] + else: + input_past[:, :, :n_targets] = out[:, -self.input_chunk_length :, :] + + # set left and right boundaries for extracting future elements + if self.input_chunk_length >= roll_size: + left_past, right_past = prediction_length - roll_size, prediction_length + else: + left_past, right_past = ( + prediction_length - self.input_chunk_length, + prediction_length, + ) + + # update past covariates to include next `roll_size` future past covariates elements + if n_past_covs and self.input_chunk_length >= roll_size: + input_past[ + :, -roll_size:, n_targets : n_targets + n_past_covs + ] = future_past_covariates[:, left_past:right_past, :] + elif n_past_covs: + input_past[ + :, :, n_targets : n_targets + n_past_covs + ] = future_past_covariates[:, left_past:right_past, :] + + # update historic future covariates to include next `roll_size` future covariates elements + if n_future_covs and self.input_chunk_length >= roll_size: + input_past[ + :, -roll_size:, n_targets + n_past_covs : + ] = future_covariates[:, left_past:right_past, :] + elif n_future_covs: + input_past[:, :, n_targets + n_past_covs :] = future_covariates[ + :, left_past:right_past, : + ] + + # ==========> FUTURE INPUT <========== + left_future, right_future = ( + right_past, + right_past + self.output_chunk_length, + ) + # update future covariates to include next `roll_size` future covariates elements + if n_future_covs: + input_future = future_covariates[:, left_future:right_future, :] + + # take only last part of the output sequence where needed + out = self._produce_predict_output(x=(input_past, input_future))[ + :, self.first_prediction_index :, : + ] + + batch_prediction.append(out) + prediction_length += self.output_chunk_length + + # bring predictions into desired format and drop unnecessary values + batch_prediction = torch.cat(batch_prediction, dim=1) + batch_prediction = batch_prediction[:, :n, :] + return batch_prediction + + +class PLSplitCovariatesModule(PLForecastingModule, ABC): + def _get_batch_prediction( + self, n: int, input_batch: Tuple, roll_size: int + ) -> torch.Tensor: + raise NotImplementedError("TBD: Darts doesn't contain such a model yet.") diff --git a/darts/models/forecasting/rnn_model.py b/darts/models/forecasting/rnn_model.py index 3cdd7f53a6..8453f50da7 100644 --- a/darts/models/forecasting/rnn_model.py +++ b/darts/models/forecasting/rnn_model.py @@ -7,23 +7,18 @@ import torch import torch.nn as nn -from numpy.random import RandomState from darts.logging import get_logger, raise_if_not -from darts.models.forecasting.torch_forecasting_model import ( - DualCovariatesTorchModel, - TorchParametricProbabilisticForecastingModel, -) +from darts.models.forecasting.pl_forecasting_module import PLDualCovariatesModule +from darts.models.forecasting.torch_forecasting_model 
import DualCovariatesTorchModel from darts.timeseries import TimeSeries from darts.utils.data import DualCovariatesShiftedDataset, TrainingDataset -from darts.utils.likelihood_models import Likelihood -from darts.utils.torch import random_method logger = get_logger(__name__) # TODO add batch norm -class _RNNModule(nn.Module): +class _RNNModule(PLDualCovariatesModule): def __init__( self, name: str, @@ -33,6 +28,7 @@ def __init__( target_size: int, nr_params: int, dropout: float = 0.0, + **kwargs ): """PyTorch module implementing an RNN to be used in `RNNModel`. @@ -58,6 +54,8 @@ def __init__( The number of parameters of the likelihood (or 1 if no likelihood is used). dropout The fraction of neurons that are dropped in all-but-last RNN layers. + **kwargs + all parameters required for :class:`darts.model.forecasting_models.PLForecastingModule` base class. Inputs ------ @@ -72,7 +70,11 @@ def __init__( However, this module always returns the whole Tensor. """ - super().__init__() + # RNNModule doesn't really need input and output_chunk_length for PLModule + super().__init__(**kwargs) + + # required for all modules -> saves hparams for checkpoints + self.save_hyperparameters() # Defining parameters self.target_size = target_size @@ -103,19 +105,92 @@ def forward(self, x, h=None): # returns outputs for all inputs, only the last one is needed for prediction time return predictions, last_hidden_state + def _produce_train_output(self, input_batch: Tuple): + past_target, historic_future_covariates, future_covariates = input_batch + # For the RNN we concatenate the past_target with the future_covariates + # (they have the same length because we enforce a Shift dataset for RNNs) + model_input = ( + torch.cat([past_target, future_covariates], dim=2) + if future_covariates is not None + else past_target + ) + return self(model_input)[0] + + def _produce_predict_output(self, x, last_hidden_state=None): + """overwrite parent classes `_produce_predict_output` method""" + output, hidden = self(x, last_hidden_state) + if self.likelihood: + return self.likelihood.sample(output), hidden + else: + return output.squeeze(dim=-1), hidden + + def _get_batch_prediction( + self, n: int, input_batch: Tuple, roll_size: int + ) -> torch.Tensor: + """ + This model is recurrent, so we have to write a specific way to obtain the time series forecasts of length n. 
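        The likelihood-based sampling in :func:`_produce_predict_output` above is what enables probabilistic
        forecasts at the model level; a usage sketch (``series`` is assumed to be a prepared ``TimeSeries``,
        and the parameter values are illustrative):

        .. highlight:: python
        .. code-block:: python

            from darts.models import RNNModel
            from darts.utils.likelihood_models import GaussianLikelihood

            model = RNNModel(input_chunk_length=24, likelihood=GaussianLikelihood())
            model.fit(series)
            probabilistic_forecast = model.predict(n=12, num_samples=100)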
+ """ + past_target, historic_future_covariates, future_covariates = input_batch + + if historic_future_covariates is not None: + # RNNs need as inputs (target[t] and covariates[t+1]) so here we shift the covariates + all_covariates = torch.cat( + [historic_future_covariates[:, 1:, :], future_covariates], dim=1 + ) + cov_past, cov_future = ( + all_covariates[:, : past_target.shape[1], :], + all_covariates[:, past_target.shape[1] :, :], + ) + input_series = torch.cat([past_target, cov_past], dim=2) + else: + input_series = past_target + cov_future = None + + batch_prediction = [] + out, last_hidden_state = self._produce_predict_output(input_series) + batch_prediction.append(out[:, -1:, :]) + prediction_length = 1 + + while prediction_length < n: -class RNNModel(TorchParametricProbabilisticForecastingModel, DualCovariatesTorchModel): - @random_method + # create new input to model from last prediction and current covariates, if available + new_input = ( + torch.cat( + [ + out[:, -1:, :], + cov_future[:, prediction_length - 1 : prediction_length, :], + ], + dim=2, + ) + if cov_future is not None + else out[:, -1:, :] + ) + + # feed new input to model, including the last hidden state from the previous iteration + out, last_hidden_state = self._produce_predict_output( + new_input, last_hidden_state + ) + + # append prediction to batch prediction array, increase counter + batch_prediction.append(out[:, -1:, :]) + prediction_length += 1 + + # bring predictions into desired format and drop unnecessary values + batch_prediction = torch.cat(batch_prediction, dim=1) + batch_prediction = batch_prediction[:, :n, :] + + return batch_prediction + + +class RNNModel(DualCovariatesTorchModel): def __init__( self, + input_chunk_length: int, model: Union[str, nn.Module] = "RNN", - input_chunk_length: int = 12, hidden_dim: int = 25, n_rnn_layers: int = 1, dropout: float = 0.0, training_length: int = 24, - likelihood: Optional[Likelihood] = None, - random_state: Optional[Union[int, RandomState]] = None, **kwargs ): @@ -146,12 +221,12 @@ def __init__( Parameters ---------- + input_chunk_length + Number of past time steps that are fed to the forecasting module at prediction time. model Either a string specifying the RNN module type ("RNN", "LSTM" or "GRU"), or a PyTorch module with the same specifications as `darts.models.rnn_model._RNNModule`. - input_chunk_length - Number of past time steps that are fed to the forecasting module at prediction time. hidden_dim Size for feature maps for each hidden RNN layer (:math:`h_n`). n_rnn_layers @@ -163,64 +238,43 @@ def __init__( training. Generally speaking, `training_length` should have a higher value than `input_chunk_length` because otherwise during training the RNN is never run for as many iterations as it will during training. For more information on this parameter, please see `darts.utils.data.ShiftedDataset` - likelihood - Optionally, the likelihood model to be used for probabilistic forecasts. - If no likelihood model is provided, forecasts will be deterministic. - random_state - Control the randomness of the weights initialization. Check this - `link `_ for more details. - - batch_size - Number of time series (input and output sequences) used in each training pass. - n_epochs - Number of epochs over which to train the model. - add_encoders - A large number of past and future covariates can be automatically generated with `add_encoders`. 
- This can be done by adding mutliple pre-defined index encoders and/or custom user-made functions that - will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to - transform the generated covariates. This happens all under one hood and only needs to be specified at - model creation. - Read :meth:`SequentialEncoder ` to find out more about - `add_encoders`. An example showing some of `add_encoders` features: - - .. highlight:: python - .. code-block:: python + **kwargs + Optional arguments to initialize the pytorch_lightning.Module, pytorch_lightning.Trainer, and + Darts' :class:`TorchForecastingModel`. - add_encoders={ - 'cyclic': {'future': ['month']}, - 'datetime_attribute': {'future': ['hour', 'dayofweek']}, - 'position': {'past': ['absolute'], 'future': ['relative']}, - 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, - 'transformer': Scaler() - } - .. + loss_fn + PyTorch loss function used for training. + This parameter will be ignored for probabilistic models if the ``likelihood`` parameter is specified. + Default: ``torch.nn.MSELoss()``. + likelihood + The likelihood model to be used for probabilistic forecasts. optimizer_cls - The PyTorch optimizer class to be used (default: `torch.optim.Adam`). + The PyTorch optimizer class to be used (default: ``torch.optim.Adam``). optimizer_kwargs Optionally, some keyword arguments for the PyTorch optimizer (e.g., ``{'lr': 1e-3}`` - for specifying a learning rate). Otherwise the default values of the selected `optimizer_cls` + for specifying a learning rate). Otherwise the default values of the selected ``optimizer_cls`` will be used. lr_scheduler_cls - Optionally, the PyTorch learning rate scheduler class to be used. Specifying `None` corresponds + Optionally, the PyTorch learning rate scheduler class to be used. Specifying ``None`` corresponds to using a constant learning rate. lr_scheduler_kwargs - Optionally, some keyword arguments for the PyTorch optimizer. - loss_fn - PyTorch loss function used for training. - This parameter will be ignored for probabilistic models if the `likelihood` parameter is specified. - Default: ``torch.nn.MSELoss()``. + Optionally, some keyword arguments for the PyTorch learning rate scheduler. + batch_size + Number of time series (input and output sequences) used in each training pass. + n_epochs + Number of epochs over which to train the model. model_name Name of the model. Used for creating checkpoints and saving tensorboard data. If not specified, - defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part of - the name is formatted with the local date and time, while PID is the processed ID (preventing models spawned - at the same time by different processes to share the same model_name). E.g., + defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part + of the name is formatted with the local date and time, while PID is the processed ID (preventing models + spawned at the same time by different processes to share the same model_name). E.g., ``"2021-06-14_09:53:32_torch_model_run_44607"``. work_dir Path of the working directory, where to save checkpoints and Tensorboard summaries. (default: current working directory). log_tensorboard If set, use Tensorboard to log the different parameters. The logs will be located in: - `[work_dir]/.darts/runs/`. + ``"{work_dir}/darts_logs/{model_name}/logs/"``. 
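As a concrete illustration of how these keyword arguments travel through ``**kwargs`` to the PL module, the PL Trainer and Darts' :class:`TorchForecastingModel`, here is a minimal construction sketch. The synthetic series, the hyperparameter values and the model name are purely illustrative; only the keyword names themselves come from the parameter list above.

.. highlight:: python
.. code-block:: python

    import numpy as np
    from darts import TimeSeries
    from darts.models import RNNModel

    # toy seasonal series; any TimeSeries works here
    series = TimeSeries.from_values(np.sin(np.arange(200) * 2 * np.pi / 12))

    model = RNNModel(
        input_chunk_length=12,
        model="LSTM",
        hidden_dim=25,
        training_length=24,
        # everything below is routed via **kwargs as described above
        n_epochs=5,
        batch_size=16,
        optimizer_kwargs={"lr": 1e-3},
        model_name="rnn_example",
        log_tensorboard=True,
        random_state=42,
    )
    model.fit(series)
    forecast = model.predict(n=6)
..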
nr_epochs_val_period Number of epochs to wait before evaluating the validation loss (if a validation ``TimeSeries`` is passed to the :func:`fit()` method). @@ -228,17 +282,83 @@ def __init__( Optionally, a string indicating the torch device to use. (default: "cuda:0" if a GPU is available, otherwise "cpu") force_reset - If set to `True`, any previously-existing model with the same name will be reset (all checkpoints will + If set to ``True``, any previously-existing model with the same name will be reset (all checkpoints will be discarded). save_checkpoints Whether or not to automatically save the untrained model and checkpoints from training. - If set to `False`, the model can still be manually saved using :func:`save_model()` - and loaded using :func:`load_model()`. + To load the model from checkpoint, call :func:`MyModelClass.load_from_checkpoint()`, where + :class:`MyModelClass` is the :class:`TorchForecastingModel` class that was used (such as :class:`TFTModel`, + :class:`NBEATSModel`, etc.). If set to ``False``, the model can still be manually saved using + :func:`save_model()` and loaded using :func:`load_model()`. + add_encoders + A large number of past and future covariates can be automatically generated with `add_encoders`. + This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that + will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to + transform the generated covariates. This happens all under one hood and only needs to be specified at + model creation. + Read :meth:`SequentialEncoder ` to find out more about + ``add_encoders``. An example showing some of ``add_encoders`` features: + + .. highlight:: python + .. code-block:: python + + add_encoders={ + 'cyclic': {'future': ['month']}, + 'datetime_attribute': {'future': ['hour', 'dayofweek']}, + 'position': {'past': ['absolute'], 'future': ['relative']}, + 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, + 'transformer': Scaler() + } + .. + random_state + Control the randomness of the weights initialization. Check this + `link `_ for more details. + pl_trainer_kwargs + By default :class:`TorchForecastingModel` creates a PyTorch Lightning Trainer with several useful presets + that performs the training, validation and prediction processes. These presets include automatic + checkpointing, tensorboard logging, setting the torch device and more. + With ``pl_trainer_kwargs`` you can add additional kwargs to instantiate the PyTorch Lightning trainer + object. Check the `PL Trainer documentation + `_ for more information about the + supported kwargs. + With parameter ``"callbacks"`` you can add custom or PyTorch-Lightning built-in callbacks to Darts' + :class:`TorchForecastingModel`. Below is an example for adding EarlyStopping to the training process. + The model will stop training early if the validation loss `val_loss` does not improve beyond + specifications. For more information on callbacks, visit: + `PyTorch Lightning Callbacks + `_ + + .. highlight:: python + .. code-block:: python + + from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + # stop training when validation loss does not decrease more than 0.05 (`min_delta`) over + # a period of 5 epochs (`patience`) + my_stopper = EarlyStopping( + monitor="val_loss", + patience=5, + min_delta=0.05, + mode='min', + ) + + pl_trainer_kwargs={"callbacks": [my_stopper]} + .. 
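Because the stopper above monitors ``val_loss``, a validation series has to be provided during training, otherwise no validation loss is ever computed. A minimal end-to-end sketch, assuming the usual ``val_series`` argument of :func:`fit()`; the data split and hyperparameter values are illustrative only.

.. highlight:: python
.. code-block:: python

    import numpy as np
    from pytorch_lightning.callbacks.early_stopping import EarlyStopping
    from darts import TimeSeries
    from darts.models import RNNModel

    values = np.sin(np.arange(400) * 2 * np.pi / 12)
    train = TimeSeries.from_values(values[:300])
    val = TimeSeries.from_values(values[300:])

    my_stopper = EarlyStopping(monitor="val_loss", patience=5, min_delta=0.05, mode="min")

    model = RNNModel(
        input_chunk_length=12,
        training_length=24,
        n_epochs=100,
        pl_trainer_kwargs={"callbacks": [my_stopper]},
    )
    # `val_series` is what produces the `val_loss` that the stopper monitors
    model.fit(train, val_series=val)
..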
+ + Note that you can also use a custom PyTorch Lightning Trainer for training and prediction with optional + parameter ``trainer`` in :func:`fit()` and :func:`predict()`. + show_warnings + whether to show warnings raised from PyTorch Lightning. Useful to detect potential issues of + your forecasting use case. """ + # create copy of model parameters + model_kwargs = {key: val for key, val in self.model_params.items()} + model_kwargs["output_chunk_length"] = 1 - kwargs["input_chunk_length"] = input_chunk_length - kwargs["output_chunk_length"] = 1 - super().__init__(likelihood=likelihood, **kwargs) + super().__init__(**self._extract_torch_model_params(**model_kwargs)) + + # extract pytorch lightning module kwargs + self.pl_module_params = self._extract_pl_module_params(**model_kwargs) # check we got right model type specified: if model not in ["RNN", "LSTM", "GRU"]: @@ -275,6 +395,7 @@ def _create_model(self, train_sample: Tuple[torch.Tensor]) -> torch.nn.Module: hidden_dim=self.hidden_dim, dropout=self.dropout, num_layers=self.n_rnn_layers, + **self.pl_module_params, ) else: model = self.rnn_type_or_module( @@ -285,6 +406,7 @@ def _create_model(self, train_sample: Tuple[torch.Tensor]) -> torch.nn.Module: hidden_dim=self.hidden_dim, dropout=self.dropout, num_layers=self.n_rnn_layers, + **self.pl_module_params, ) return model @@ -313,79 +435,3 @@ def _verify_train_dataset_type(self, train_dataset: TrainingDataset): train_dataset.ds_past.shift == 1, "RNNModel requires a shifted training dataset with shift=1.", ) - - def _produce_train_output(self, input_batch: Tuple): - past_target, historic_future_covariates, future_covariates = input_batch - # For the RNN we concatenate the past_target with the future_covariates - # (they have the same length because we enforce a Shift dataset for RNNs) - model_input = ( - torch.cat([past_target, future_covariates], dim=2) - if future_covariates is not None - else past_target - ) - return self.model(model_input)[0] - - @random_method - def _produce_predict_output(self, x, last_hidden_state=None): - output, hidden = self.model(x, last_hidden_state) - if self.likelihood: - return self.likelihood.sample(output), hidden - else: - return output.squeeze(dim=-1), hidden - - def _get_batch_prediction( - self, n: int, input_batch: Tuple, roll_size: int - ) -> torch.Tensor: - """ - This model is recurrent, so we have to write a specific way to obtain the time series forecasts of length n. 
- """ - past_target, historic_future_covariates, future_covariates = input_batch - - if historic_future_covariates is not None: - # RNNs need as inputs (target[t] and covariates[t+1]) so here we shift the covariates - all_covariates = torch.cat( - [historic_future_covariates[:, 1:, :], future_covariates], dim=1 - ) - cov_past, cov_future = ( - all_covariates[:, : past_target.shape[1], :], - all_covariates[:, past_target.shape[1] :, :], - ) - input_series = torch.cat([past_target, cov_past], dim=2) - else: - input_series = past_target - cov_future = None - - batch_prediction = [] - out, last_hidden_state = self._produce_predict_output(input_series) - batch_prediction.append(out[:, -1:, :]) - prediction_length = 1 - - while prediction_length < n: - - # create new input to model from last prediction and current covariates, if available - new_input = ( - torch.cat( - [ - out[:, -1:, :], - cov_future[:, prediction_length - 1 : prediction_length, :], - ], - dim=2, - ) - if cov_future is not None - else out[:, -1:, :] - ) - - # feed new input to model, including the last hidden state from the previous iteration - out, last_hidden_state = self._produce_predict_output( - new_input, last_hidden_state - ) - - # append prediction to batch prediction array, increase counter - batch_prediction.append(out[:, -1:, :]) - prediction_length += 1 - - # bring predictions into desired format and drop unnecessary values - batch_prediction = torch.cat(batch_prediction, dim=1) - batch_prediction = batch_prediction[:, :n, :] - - return batch_prediction diff --git a/darts/models/forecasting/tcn_model.py b/darts/models/forecasting/tcn_model.py index 0ae6b7622c..7d28f7ba18 100644 --- a/darts/models/forecasting/tcn_model.py +++ b/darts/models/forecasting/tcn_model.py @@ -4,22 +4,17 @@ """ import math -from typing import Optional, Sequence, Tuple, Union +from typing import Optional, Sequence, Tuple import torch import torch.nn as nn import torch.nn.functional as F -from numpy.random import RandomState from darts.logging import get_logger, raise_if_not -from darts.models.forecasting.torch_forecasting_model import ( - PastCovariatesTorchModel, - TorchParametricProbabilisticForecastingModel, -) +from darts.models.forecasting.pl_forecasting_module import PLPastCovariatesModule +from darts.models.forecasting.torch_forecasting_model import PastCovariatesTorchModel from darts.timeseries import TimeSeries from darts.utils.data import PastCovariatesShiftedDataset -from darts.utils.likelihood_models import Likelihood -from darts.utils.torch import random_method logger = get_logger(__name__) @@ -129,11 +124,10 @@ def forward(self, x): return x -class _TCNModule(nn.Module): +class _TCNModule(PLPastCovariatesModule): def __init__( self, input_size: int, - input_chunk_length: int, kernel_size: int, num_filters: int, num_layers: Optional[int], @@ -143,6 +137,7 @@ def __init__( nr_params: int, target_length: int, dropout: float, + **kwargs ): """PyTorch module implementing a dilated TCN module used in `TCNModel`. @@ -156,8 +151,6 @@ def __init__( The dimensionality of the output time series. nr_params The number of parameters of the likelihood (or 1 if no likelihood is used). - input_chunk_length - The length of the input time series. target_length Number of time steps the torch module will predict into the future at once. kernel_size @@ -172,6 +165,8 @@ def __init__( The base of the exponent that will determine the dilation on every level. dropout The dropout rate for every convolutional layer. 
+ **kwargs + all parameters required for :class:`darts.model.forecasting_models.PLForecastingModule` base class. Inputs ------ @@ -186,11 +181,13 @@ def __init__( leading up to the first prediction, all in chronological order. """ - super().__init__() + super().__init__(**kwargs) + + # required for all modules -> saves hparams for checkpoints + self.save_hyperparameters() # Defining parameters self.input_size = input_size - self.input_chunk_length = input_chunk_length self.n_filters = num_filters self.kernel_size = kernel_size self.target_length = target_length @@ -203,7 +200,7 @@ def __init__( if num_layers is None and dilation_base > 1: num_layers = math.ceil( math.log( - (input_chunk_length - 1) + (self.input_chunk_length - 1) * (dilation_base - 1) / (kernel_size - 1) / 2 @@ -213,7 +210,9 @@ def __init__( ) logger.info("Number of layers chosen: " + str(num_layers)) elif num_layers is None: - num_layers = math.ceil((input_chunk_length - 1) / (kernel_size - 1) / 2) + num_layers = math.ceil( + (self.input_chunk_length - 1) / (kernel_size - 1) / 2 + ) logger.info("Number of layers chosen: " + str(num_layers)) self.num_layers = num_layers @@ -249,9 +248,12 @@ def forward(self, x): return x + @property + def first_prediction_index(self) -> int: + return -self.output_chunk_length + -class TCNModel(TorchParametricProbabilisticForecastingModel, PastCovariatesTorchModel): - @random_method +class TCNModel(PastCovariatesTorchModel): def __init__( self, input_chunk_length: int, @@ -262,8 +264,6 @@ def __init__( dilation_base: int = 2, weight_norm: bool = False, dropout: float = 0.2, - likelihood: Optional[Likelihood] = None, - random_state: Optional[Union[int, RandomState]] = None, **kwargs ): @@ -291,64 +291,43 @@ def __init__( The number of convolutional layers. dropout The dropout rate for every convolutional layer. - likelihood - Optionally, the likelihood model to be used for probabilistic forecasts. - If no likelihood model is provided, forecasts will be deterministic. - random_state - Control the randomness of the weights initialization. Check this - `link `_ for more details. - - batch_size - Number of time series (input and output sequences) used in each training pass. - n_epochs - Number of epochs over which to train the model. - add_encoders - A large number of past and future covariates can be automatically generated with `add_encoders`. - This can be done by adding mutliple pre-defined index encoders and/or custom user-made functions that - will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to - transform the generated covariates. This happens all under one hood and only needs to be specified at - model creation. - Read :meth:`SequentialEncoder ` to find out more about - `add_encoders`. An example showing some of `add_encoders` features: - - .. highlight:: python - .. code-block:: python + **kwargs + Optional arguments to initialize the pytorch_lightning.Module, pytorch_lightning.Trainer, and + Darts' :class:`TorchForecastingModel`. - add_encoders={ - 'cyclic': {'future': ['month']}, - 'datetime_attribute': {'future': ['hour', 'dayofweek']}, - 'position': {'past': ['absolute'], 'future': ['relative']}, - 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, - 'transformer': Scaler() - } - .. + loss_fn + PyTorch loss function used for training. + This parameter will be ignored for probabilistic models if the ``likelihood`` parameter is specified. + Default: ``torch.nn.MSELoss()``. 
+ likelihood + The likelihood model to be used for probabilistic forecasts. optimizer_cls - The PyTorch optimizer class to be used (default: `torch.optim.Adam`). + The PyTorch optimizer class to be used (default: ``torch.optim.Adam``). optimizer_kwargs Optionally, some keyword arguments for the PyTorch optimizer (e.g., ``{'lr': 1e-3}`` - for specifying a learning rate). Otherwise the default values of the selected `optimizer_cls` + for specifying a learning rate). Otherwise the default values of the selected ``optimizer_cls`` will be used. lr_scheduler_cls - Optionally, the PyTorch learning rate scheduler class to be used. Specifying `None` corresponds + Optionally, the PyTorch learning rate scheduler class to be used. Specifying ``None`` corresponds to using a constant learning rate. lr_scheduler_kwargs - Optionally, some keyword arguments for the PyTorch optimizer. - loss_fn - PyTorch loss function used for training. - This parameter will be ignored for probabilistic models if the `likelihood` parameter is specified. - Default: ``torch.nn.MSELoss()``. + Optionally, some keyword arguments for the PyTorch learning rate scheduler. + batch_size + Number of time series (input and output sequences) used in each training pass. + n_epochs + Number of epochs over which to train the model. model_name Name of the model. Used for creating checkpoints and saving tensorboard data. If not specified, - defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part of - the name is formatted with the local date and time, while PID is the processed ID (preventing models spawned - at the same time by different processes to share the same model_name). E.g., + defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part + of the name is formatted with the local date and time, while PID is the processed ID (preventing models + spawned at the same time by different processes to share the same model_name). E.g., ``"2021-06-14_09:53:32_torch_model_run_44607"``. work_dir Path of the working directory, where to save checkpoints and Tensorboard summaries. (default: current working directory). log_tensorboard If set, use Tensorboard to log the different parameters. The logs will be located in: - `[work_dir]/.darts/runs/`. + ``"{work_dir}/darts_logs/{model_name}/logs/"``. nr_epochs_val_period Number of epochs to wait before evaluating the validation loss (if a validation ``TimeSeries`` is passed to the :func:`fit()` method). @@ -356,12 +335,74 @@ def __init__( Optionally, a string indicating the torch device to use. (default: "cuda:0" if a GPU is available, otherwise "cpu") force_reset - If set to `True`, any previously-existing model with the same name will be reset (all checkpoints will + If set to ``True``, any previously-existing model with the same name will be reset (all checkpoints will be discarded). save_checkpoints Whether or not to automatically save the untrained model and checkpoints from training. - If set to `False`, the model can still be manually saved using :func:`save_model()` - and loaded using :func:`load_model()`. + To load the model from checkpoint, call :func:`MyModelClass.load_from_checkpoint()`, where + :class:`MyModelClass` is the :class:`TorchForecastingModel` class that was used (such as :class:`TFTModel`, + :class:`NBEATSModel`, etc.). If set to ``False``, the model can still be manually saved using + :func:`save_model()` and loaded using :func:`load_model()`. 
+ add_encoders + A large number of past and future covariates can be automatically generated with `add_encoders`. + This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that + will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to + transform the generated covariates. This happens all under one hood and only needs to be specified at + model creation. + Read :meth:`SequentialEncoder ` to find out more about + ``add_encoders``. An example showing some of ``add_encoders`` features: + + .. highlight:: python + .. code-block:: python + + add_encoders={ + 'cyclic': {'future': ['month']}, + 'datetime_attribute': {'future': ['hour', 'dayofweek']}, + 'position': {'past': ['absolute'], 'future': ['relative']}, + 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, + 'transformer': Scaler() + } + .. + random_state + Control the randomness of the weights initialization. Check this + `link `_ for more details. + pl_trainer_kwargs + By default :class:`TorchForecastingModel` creates a PyTorch Lightning Trainer with several useful presets + that performs the training, validation and prediction processes. These presets include automatic + checkpointing, tensorboard logging, setting the torch device and more. + With ``pl_trainer_kwargs`` you can add additional kwargs to instantiate the PyTorch Lightning trainer + object. Check the `PL Trainer documentation + `_ for more information about the + supported kwargs. + With parameter ``"callbacks"`` you can add custom or PyTorch-Lightning built-in callbacks to Darts' + :class:`TorchForecastingModel`. Below is an example for adding EarlyStopping to the training process. + The model will stop training early if the validation loss `val_loss` does not improve beyond + specifications. For more information on callbacks, visit: + `PyTorch Lightning Callbacks + `_ + + .. highlight:: python + .. code-block:: python + + from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + # stop training when validation loss does not decrease more than 0.05 (`min_delta`) over + # a period of 5 epochs (`patience`) + my_stopper = EarlyStopping( + monitor="val_loss", + patience=5, + min_delta=0.05, + mode='min', + ) + + pl_trainer_kwargs={"callbacks": [my_stopper]} + .. + + Note that you can also use a custom PyTorch Lightning Trainer for training and prediction with optional + parameter ``trainer`` in :func:`fit()` and :func:`predict()`. + show_warnings + whether to show warnings raised from PyTorch Lightning. Useful to detect potential issues of + your forecasting use case. 
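A usage sketch for the parameters above. The data and hyperparameter values are made up for the example; only the keyword names come from this docstring. Leaving ``num_layers=None`` lets ``_TCNModule`` derive the number of layers from ``input_chunk_length``, ``kernel_size`` and ``dilation_base`` (the ``math.ceil``/``math.log`` rule shown earlier in this file), and passing a ``likelihood`` makes the forecasts probabilistic, in which case ``predict()`` is typically called with ``num_samples`` much larger than 1.

.. highlight:: python
.. code-block:: python

    import numpy as np
    from darts import TimeSeries
    from darts.models import TCNModel
    from darts.utils.likelihood_models import QuantileRegression

    series = TimeSeries.from_values(np.sin(np.arange(300) * 2 * np.pi / 12))

    model = TCNModel(
        input_chunk_length=24,
        output_chunk_length=12,
        kernel_size=3,
        num_filters=4,
        dilation_base=2,
        num_layers=None,                  # derived from input_chunk_length, kernel_size, dilation_base
        dropout=0.2,
        likelihood=QuantileRegression(),  # probabilistic forecasts, passed through **kwargs
        n_epochs=5,
    )
    model.fit(series)
    # with a likelihood, sample several trajectories to get a forecast distribution
    forecast = model.predict(n=12, num_samples=100)
..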
References ---------- @@ -379,13 +420,11 @@ def __init__( logger, ) - kwargs["input_chunk_length"] = input_chunk_length - kwargs["output_chunk_length"] = output_chunk_length + super().__init__(**self._extract_torch_model_params(**self.model_params)) - super().__init__(likelihood=likelihood, **kwargs) + # extract pytorch lightning module kwargs + self.pl_module_params = self._extract_pl_module_params(**self.model_params) - self.input_chunk_length = input_chunk_length - self.output_chunk_length = output_chunk_length self.kernel_size = kernel_size self.num_filters = num_filters self.num_layers = num_layers @@ -403,7 +442,6 @@ def _create_model(self, train_sample: Tuple[torch.Tensor]) -> torch.nn.Module: return _TCNModule( input_size=input_dim, - input_chunk_length=self.input_chunk_length, target_size=output_dim, nr_params=nr_params, kernel_size=self.kernel_size, @@ -413,6 +451,7 @@ def _create_model(self, train_sample: Tuple[torch.Tensor]) -> torch.nn.Module: target_length=self.output_chunk_length, dropout=self.dropout, weight_norm=self.weight_norm, + **self.pl_module_params, ) def _build_train_dataset( @@ -430,15 +469,3 @@ def _build_train_dataset( shift=self.output_chunk_length, max_samples_per_ts=max_samples_per_ts, ) - - @random_method - def _produce_predict_output(self, x): - if self.likelihood: - output = self.model(x) - return self.likelihood.sample(output) - else: - return self.model(x).squeeze(dim=-1) - - @property - def first_prediction_index(self) -> int: - return -self.output_chunk_length diff --git a/darts/models/forecasting/tft_model.py b/darts/models/forecasting/tft_model.py index 4f5f86de0f..264b2efd9d 100644 --- a/darts/models/forecasting/tft_model.py +++ b/darts/models/forecasting/tft_model.py @@ -7,22 +7,19 @@ import numpy as np import torch -from numpy.random import RandomState from torch import nn from torch.nn import LSTM as _LSTM from darts import TimeSeries from darts.logging import get_logger, raise_if, raise_if_not +from darts.models.forecasting.pl_forecasting_module import PLMixedCovariatesModule from darts.models.forecasting.tft_submodels import ( _GateAddNorm, _GatedResidualNetwork, _InterpretableMultiHeadAttention, _VariableSelectionNetwork, ) -from darts.models.forecasting.torch_forecasting_model import ( - MixedCovariatesTorchModel, - TorchParametricProbabilisticForecastingModel, -) +from darts.models.forecasting.torch_forecasting_model import MixedCovariatesTorchModel from darts.utils.data import ( MixedCovariatesInferenceDataset, MixedCovariatesSequentialDataset, @@ -30,7 +27,6 @@ TrainingDataset, ) from darts.utils.likelihood_models import Likelihood, QuantileRegression -from darts.utils.torch import random_method logger = get_logger(__name__) @@ -39,12 +35,10 @@ ] -class _TFTModule(nn.Module): +class _TFTModule(PLMixedCovariatesModule): def __init__( self, output_dim: Tuple[int, int], - input_chunk_length: int, - output_chunk_length: int, variables_meta: Dict[str, Dict[str, List[str]]], hidden_size: Union[int, List[int]] = 16, lstm_layers: int = 1, @@ -53,7 +47,7 @@ def __init__( hidden_continuous_size: int = 8, dropout: float = 0.1, add_relative_index: bool = False, - likelihood: Optional[Likelihood] = None, + **kwargs, ): """PyTorch module implementing the TFT architecture from `this paper `_ @@ -64,10 +58,6 @@ def __init__( ---------- output_dim : Tuple[int, int] shape of output given by (n_targets, loss_size). (loss_size corresponds to nr_params in other models). 
- input_chunk_length : int - encoder length; number of past time steps that are fed to the forecasting module at prediction time. - output_chunk_length : int - decoder length; number of future time steps that are fed to the forecasting module at prediction time. variables_meta : Dict[str, Dict[str, List[str]]] dict containing variable enocder, decoder variable names for mapping tensors in `_TFTModule.forward()` hidden_size : int @@ -92,13 +82,16 @@ def __init__( likelihood The likelihood model to be used for probabilistic forecasts. By default the TFT uses a ``QuantileRegression`` likelihood. + **kwargs + all parameters required for :class:`darts.model.forecasting_models.PLForecastingModule` base class. """ - super().__init__() + super().__init__(**kwargs) + + # required for all modules -> saves hparams for checkpoints + self.save_hyperparameters() self.n_targets, self.loss_size = output_dim - self.input_chunk_length = input_chunk_length - self.output_chunk_length = output_chunk_length self.variables_meta = variables_meta self.hidden_size = hidden_size self.hidden_continuous_size = hidden_continuous_size @@ -106,7 +99,6 @@ def __init__( self.num_attention_heads = num_attention_heads self.full_attention = full_attention self.dropout = dropout - self.likelihood = likelihood self.add_relative_index = add_relative_index # initialize last batch size to check if new mask needs to be generated @@ -342,15 +334,27 @@ def get_attention_mask_future( ) return mask - def forward(self, x) -> Dict[str, torch.Tensor]: - """ - input dimensions: (n_samples, n_time_steps, n_variables) - """ + def forward(self, x: Tuple[torch.Tensor, Optional[torch.Tensor]]) -> torch.Tensor: + """TFT model forward pass. + Parameters + ---------- + x + comes as tuple `(x_past, x_future)` where `x_past` is the input/past chunk and `x_future` + is the output/future chunk. 
Input dimensions are `(n_samples, n_time_steps, n_variables)` + + Returns + ------- + torch.Tensor + the output tensor + """ + x_cont_past, x_cont_future = x dim_samples, dim_time, dim_variable = 0, 1, 2 - past_target, past_covariates, historic_future_covariates, future_covariates = x - batch_size = past_target.shape[dim_samples] + # TODO: impelement static covariates + static_covariates = None + + batch_size = x_cont_past.shape[dim_samples] encoder_length = self.input_chunk_length decoder_length = self.output_chunk_length time_steps = encoder_length + decoder_length @@ -361,70 +365,44 @@ def forward(self, x) -> Dict[str, torch.Tensor]: self.attention_mask = self.get_attention_mask_full( time_steps=time_steps, batch_size=batch_size, - dtype=past_target.dtype, - device=past_target.device, + dtype=x_cont_past.dtype, + device=self.device, ) else: self.attention_mask = self.get_attention_mask_future( encoder_length=encoder_length, decoder_length=decoder_length, batch_size=batch_size, - device=past_target.device, + device=self.device, ) if self.add_relative_index: self.relative_index = self.get_relative_index( encoder_length=encoder_length, decoder_length=decoder_length, batch_size=batch_size, - device=past_target.device, - dtype=past_target.dtype, + device=self.device, + dtype=x_cont_past.dtype, ) self.batch_size_last = batch_size if self.add_relative_index: - historic_future_covariates = torch.cat( + x_cont_past = torch.cat( [ ts[:, :encoder_length, :] - for ts in [historic_future_covariates, self.relative_index] + for ts in [x_cont_past, self.relative_index] if ts is not None ], dim=dim_variable, ) - future_covariates = torch.cat( + x_cont_future = torch.cat( [ ts[:, -decoder_length:, :] - for ts in [future_covariates, self.relative_index] + for ts in [x_cont_future, self.relative_index] if ts is not None ], dim=dim_variable, ) - # TODO: impelement static covariates - static_covariates = None - - # data is of size (batch_size, input_length, input_size) - x_cont_past = torch.cat( - [ - tensor - for tensor in [ - past_target, - past_covariates, - historic_future_covariates, - static_covariates, - ] - if tensor is not None - ], - dim=dim_variable, - ) - - x_cont_future = torch.cat( - [ - tensor - for tensor in [future_covariates, static_covariates] - if tensor is not None - ], - dim=dim_variable, - ) input_vectors_past = { name: x_cont_past[..., idx].unsqueeze(-1) @@ -444,22 +422,19 @@ def forward(self, x) -> Dict[str, torch.Tensor]: raise NotImplementedError("Static covariates have yet to be defined") else: static_embedding = torch.zeros( - (past_target.shape[0], self.hidden_size), - dtype=past_target.dtype, - device=past_target.device, + (x_cont_past.shape[0], self.hidden_size), + dtype=x_cont_past.dtype, + device=self.device, ) # # TODO: implement below when static covariates are supported # # this is only to interpret the output # static_covariate_var = torch.zeros( - # (past_target.shape[0], 0), - # dtype=past_target.dtype, - # device=past_target.device, + # (x_cont_past.shape[0], 0), + # dtype=x_cont_past.dtype, + # device=x_cont_past.device, # ) - if future_covariates is None and static_covariates is None: - raise NotImplementedError("make zero tensor if future covariates is None") - static_context_expanded = self.expand_static_context( context=self.static_context_grn(static_embedding), time_steps=time_steps ) @@ -544,7 +519,6 @@ def forward(self, x) -> Dict[str, torch.Tensor]: ) # generate output for n_targets and loss_size elements for loss evaluation - out = self.output_layer(out[:, 
encoder_length:] if self.full_attention else out) out = out.view( batch_size, self.output_chunk_length, self.n_targets, self.loss_size @@ -564,12 +538,11 @@ def forward(self, x) -> Dict[str, torch.Tensor]: return out -class TFTModel(TorchParametricProbabilisticForecastingModel, MixedCovariatesTorchModel): - @random_method +class TFTModel(MixedCovariatesTorchModel): def __init__( self, - input_chunk_length: int = 12, - output_chunk_length: int = 1, + input_chunk_length: int, + output_chunk_length: int, hidden_size: Union[int, List[int]] = 16, lstm_layers: int = 1, num_attention_heads: int = 4, @@ -579,7 +552,6 @@ def __init__( add_relative_index: bool = False, loss_fn: Optional[nn.Module] = None, likelihood: Optional[Likelihood] = None, - random_state: Optional[Union[int, RandomState]] = None, **kwargs, ): """Temporal Fusion Transformers (TFT) for Interpretable Time Series Forecasting. @@ -589,15 +561,16 @@ def __init__( The internal sub models are adopted from `pytorch-forecasting's TemporalFusionTransformer `_ implementation. - This model supports mixed covariates (includes past covariates known for `input_chunk_length` - points before prediction time and future covariates known for `output_chunk_length` after prediction time). + This model supports mixed covariates (includes past covariates known for ``input_chunk_length`` + points before prediction time and future covariates known for ``output_chunk_length`` after prediction time). - The TFT applies multi-head attention queries on future inputs from mandatory `future_covariates`. - Specifying future encoders with `add_encoders` (read below) can automatically generate future covariates - and allows to use the model without having to pass any `future_covariates` to `fit()` and `predict()`. + The TFT applies multi-head attention queries on future inputs from mandatory ``future_covariates``. + Specifying future encoders with ``add_encoders`` (read below) can automatically generate future covariates + and allows to use the model without having to pass any ``future_covariates`` to :func:`fit()` and + :func:`predict()`. By default, this model uses the ``QuantileRegression`` likelihood, which means that its forecasts are - probabilistic; it is recommended to call :func`predict()` with `num_samples >> 1` to get meaningful results. + probabilistic; it is recommended to call :func`predict()` with ``num_samples >> 1`` to get meaningful results. Parameters ---------- @@ -613,17 +586,17 @@ def __init__( num_attention_heads : int Number of attention heads (4 is a good default) full_attention : bool - If `True`, applies multi-head attention query on past (encoder) and future (decoder) parts. Otherwise, - only queries on future part. Defaults to `False`. + If ``True``, applies multi-head attention query on past (encoder) and future (decoder) parts. Otherwise, + only queries on future part. Defaults to ``False``. dropout : float Fraction of neurons afected by Dropout. hidden_continuous_size : int Default for hidden size for processing continuous variables add_relative_index : bool - Whether to add positional values to future covariates. Defaults to `False`. - This allows to use the TFTModel without having to pass future_covariates to `fit()` and `train()`. - It gives a value to the position of each step from input and output chunk relative to the prediction - point. The values are normalized with `input_chunk_length`. + Whether to add positional values to future covariates. Defaults to ``False``. 
+ This allows to use the TFTModel without having to pass future_covariates to :fun:`fit()` and + :func:`train()`. It gives a value to the position of each step from input and output chunk relative + to the prediction point. The values are normalized with ``input_chunk_length``. loss_fn : nn.Module PyTorch loss function used for training. By default the TFT model is probabilistic and uses a ``likelihood`` instead (``QuantileRegression``). To make the model deterministic, you can set the ``likelihood`` to None @@ -631,59 +604,37 @@ def __init__( likelihood The likelihood model to be used for probabilistic forecasts. By default the TFT uses a ``QuantileRegression`` likelihood. - random_state - Control the randomness of the weights initialization. Check this - `link `_ for more details. **kwargs - Optional arguments to initialize the torch.Module + Optional arguments to initialize the pytorch_lightning.Module, pytorch_lightning.Trainer, and + Darts' :class:`TorchForecastingModel`. - batch_size - Number of time series (input and output sequences) used in each training pass. - n_epochs - Number of epochs over which to train the model. - add_encoders - A large number of past and future covariates can be automatically generated with `add_encoders`. - This can be done by adding mutliple pre-defined index encoders and/or custom user-made functions that - will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to - transform the generated covariates. This happens all under one hood and only needs to be specified at - model creation. - Read :meth:`SequentialEncoder ` to find out more about - `add_encoders`. An example showing some of `add_encoders` features: - - .. highlight:: python - .. code-block:: python - - add_encoders={ - 'cyclic': {'future': ['month']}, - 'datetime_attribute': {'future': ['hour', 'dayofweek']}, - 'position': {'past': ['absolute'], 'future': ['relative']}, - 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, - 'transformer': Scaler() - } - .. optimizer_cls - The PyTorch optimizer class to be used (default: `torch.optim.Adam`). + The PyTorch optimizer class to be used (default: ``torch.optim.Adam``). optimizer_kwargs Optionally, some keyword arguments for the PyTorch optimizer (e.g., ``{'lr': 1e-3}`` - for specifying a learning rate). Otherwise the default values of the selected `optimizer_cls` + for specifying a learning rate). Otherwise the default values of the selected ``optimizer_cls`` will be used. lr_scheduler_cls - Optionally, the PyTorch learning rate scheduler class to be used. Specifying `None` corresponds + Optionally, the PyTorch learning rate scheduler class to be used. Specifying ``None`` corresponds to using a constant learning rate. lr_scheduler_kwargs - Optionally, some keyword arguments for the PyTorch optimizer. + Optionally, some keyword arguments for the PyTorch learning rate scheduler. + batch_size + Number of time series (input and output sequences) used in each training pass. + n_epochs + Number of epochs over which to train the model. model_name Name of the model. Used for creating checkpoints and saving tensorboard data. If not specified, - defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part of - the name is formatted with the local date and time, while PID is the processed ID (preventing models spawned - at the same time by different processes to share the same model_name). 
E.g., + defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part + of the name is formatted with the local date and time, while PID is the processed ID (preventing models + spawned at the same time by different processes to share the same model_name). E.g., ``"2021-06-14_09:53:32_torch_model_run_44607"``. work_dir Path of the working directory, where to save checkpoints and Tensorboard summaries. (default: current working directory). log_tensorboard If set, use Tensorboard to log the different parameters. The logs will be located in: - `[work_dir]/.darts/runs/`. + ``"{work_dir}/darts_logs/{model_name}/logs/"``. nr_epochs_val_period Number of epochs to wait before evaluating the validation loss (if a validation ``TimeSeries`` is passed to the :func:`fit()` method). @@ -691,28 +642,90 @@ def __init__( Optionally, a string indicating the torch device to use. (default: "cuda:0" if a GPU is available, otherwise "cpu") force_reset - If set to `True`, any previously-existing model with the same name will be reset (all checkpoints will + If set to ``True``, any previously-existing model with the same name will be reset (all checkpoints will be discarded). save_checkpoints Whether or not to automatically save the untrained model and checkpoints from training. - If set to `False`, the model can still be manually saved using :func:`save_model()` - and loaded using :func:`load_model()`. + To load the model from checkpoint, call :func:`MyModelClass.load_from_checkpoint()`, where + :class:`MyModelClass` is the :class:`TorchForecastingModel` class that was used (such as :class:`TFTModel`, + :class:`NBEATSModel`, etc.). If set to ``False``, the model can still be manually saved using + :func:`save_model()` and loaded using :func:`load_model()`. + add_encoders + A large number of past and future covariates can be automatically generated with `add_encoders`. + This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that + will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to + transform the generated covariates. This happens all under one hood and only needs to be specified at + model creation. + Read :meth:`SequentialEncoder ` to find out more about + ``add_encoders``. An example showing some of ``add_encoders`` features: + + .. highlight:: python + .. code-block:: python + + add_encoders={ + 'cyclic': {'future': ['month']}, + 'datetime_attribute': {'future': ['hour', 'dayofweek']}, + 'position': {'past': ['absolute'], 'future': ['relative']}, + 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, + 'transformer': Scaler() + } + .. + random_state + Control the randomness of the weights initialization. Check this + `link `_ for more details. + pl_trainer_kwargs + By default :class:`TorchForecastingModel` creates a PyTorch Lightning Trainer with several useful presets + that performs the training, validation and prediction processes. These presets include automatic + checkpointing, tensorboard logging, setting the torch device and more. + With ``pl_trainer_kwargs`` you can add additional kwargs to instantiate the PyTorch Lightning trainer + object. Check the `PL Trainer documentation + `_ for more information about the + supported kwargs. + With parameter ``"callbacks"`` you can add custom or PyTorch-Lightning built-in callbacks to Darts' + :class:`TorchForecastingModel`. Below is an example for adding EarlyStopping to the training process. 
+ The model will stop training early if the validation loss `val_loss` does not improve beyond + specifications. For more information on callbacks, visit: + `PyTorch Lightning Callbacks + `_ + + .. highlight:: python + .. code-block:: python + + from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + # stop training when validation loss does not decrease more than 0.05 (`min_delta`) over + # a period of 5 epochs (`patience`) + my_stopper = EarlyStopping( + monitor="val_loss", + patience=5, + min_delta=0.05, + mode='min', + ) + + pl_trainer_kwargs={"callbacks": [my_stopper]} + .. + + Note that you can also use a custom PyTorch Lightning Trainer for training and prediction with optional + parameter ``trainer`` in :func:`fit()` and :func:`predict()`. + show_warnings + whether to show warnings raised from PyTorch Lightning. Useful to detect potential issues of + your forecasting use case. References ---------- .. [1] https://arxiv.org/pdf/1912.09363.pdf """ + model_kwargs = {key: val for key, val in self.model_params.items()} if likelihood is None and loss_fn is None: # This is the default if no loss information is provided - likelihood = QuantileRegression() + model_kwargs["loss_fn"] = None + model_kwargs["likelihood"] = QuantileRegression() + + super().__init__(**self._extract_torch_model_params(**model_kwargs)) - kwargs["loss_fn"] = loss_fn - kwargs["input_chunk_length"] = input_chunk_length - kwargs["output_chunk_length"] = output_chunk_length - super().__init__(likelihood=likelihood, **kwargs) + # extract pytorch lightning module kwargs + self.pl_module_params = self._extract_pl_module_params(**model_kwargs) - self.input_chunk_length = input_chunk_length - self.output_chunk_length = output_chunk_length self.hidden_size = hidden_size self.lstm_layers = lstm_layers self.num_attention_heads = num_attention_heads @@ -720,8 +733,6 @@ def __init__( self.dropout = dropout self.hidden_continuous_size = hidden_continuous_size self.add_relative_index = add_relative_index - self.loss_fn = loss_fn - self.likelihood = likelihood self.output_dim: Optional[Tuple[int, int]] = None def _create_model(self, train_sample: MixedCovariatesTrainTensorType) -> nn.Module: @@ -850,16 +861,14 @@ def _create_model(self, train_sample: MixedCovariatesTrainTensorType) -> nn.Modu return _TFTModule( variables_meta=variables_meta, output_dim=self.output_dim, - input_chunk_length=self.input_chunk_length, - output_chunk_length=self.output_chunk_length, hidden_size=self.hidden_size, lstm_layers=self.lstm_layers, dropout=self.dropout, num_attention_heads=self.num_attention_heads, full_attention=self.full_attention, hidden_continuous_size=self.hidden_continuous_size, - likelihood=self.likelihood, add_relative_index=self.add_relative_index, + **self.pl_module_params, ) def _build_train_dataset( @@ -911,9 +920,6 @@ def _build_inference_dataset( output_chunk_length=self.output_chunk_length, ) - def _produce_train_output(self, input_batch: Tuple): - return self.model(input_batch) - def predict(self, n, *args, **kwargs): # since we have future covariates, the inference dataset for future input must be at least of length # `output_chunk_length`. 
If not, we would have to step back which causes past input to be shorter than @@ -923,159 +929,3 @@ def predict(self, n, *args, **kwargs): return super().predict(n, *args, **kwargs) else: return super().predict(self.output_chunk_length, *args, **kwargs)[:n] - - @random_method - def _produce_predict_output(self, x): - if self.likelihood: - output = self.model(x) - return self.likelihood.sample(output) - else: - return self.model(x).squeeze(dim=-1) - - def _get_batch_prediction( - self, n: int, input_batch: Tuple, roll_size: int - ) -> torch.Tensor: - """ - Feeds MixedCovariatesModel with input and output chunks of a MixedCovariatesSequentialDataset to farecast - the next `n` target values per target variable. - - Parameters: - ---------- - n - prediction length - input_batch - (past_target, past_covariates, historic_future_covariates, future_covariates, future_past_covariates) - roll_size - roll input arrays after every sequence by `roll_size`. Initially, `roll_size` is equivalent to - `self.output_chunk_length` - """ - dim_component = 2 - ( - past_target, - past_covariates, - historic_future_covariates, - future_covariates, - future_past_covariates, - ) = input_batch - - n_targets = past_target.shape[dim_component] - n_past_covs = ( - past_covariates.shape[dim_component] if past_covariates is not None else 0 - ) - n_future_covs = ( - future_covariates.shape[dim_component] - if future_covariates is not None - else 0 - ) - - input_past = torch.cat( - [ - ds - for ds in [past_target, past_covariates, historic_future_covariates] - if ds is not None - ], - dim=dim_component, - ) - - input_future = ( - future_covariates[:, :roll_size, :] - if future_covariates is not None - else None - ) - - out = self._produce_predict_output( - x=(past_target, past_covariates, historic_future_covariates, input_future) - )[:, self.first_prediction_index :, :] - - batch_prediction = [out[:, :roll_size, :]] - prediction_length = roll_size - - while prediction_length < n: - # we want the last prediction to end exactly at `n` into the future. 
- # this means we may have to truncate the previous prediction and step - # back the roll size for the last chunk - if prediction_length + self.output_chunk_length > n: - spillover_prediction_length = ( - prediction_length + self.output_chunk_length - n - ) - roll_size -= spillover_prediction_length - prediction_length -= spillover_prediction_length - batch_prediction[-1] = batch_prediction[-1][:, :roll_size, :] - - # ==========> PAST INPUT <========== - # roll over input series to contain latest target and covariate - input_past = torch.roll(input_past, -roll_size, 1) - - # update target input to include next `roll_size` predictions - if self.input_chunk_length >= roll_size: - input_past[:, -roll_size:, :n_targets] = out[:, :roll_size, :] - else: - input_past[:, :, :n_targets] = out[:, -self.input_chunk_length :, :] - - # set left and right boundaries for extracting future elements - if self.input_chunk_length >= roll_size: - left_past, right_past = prediction_length - roll_size, prediction_length - else: - left_past, right_past = ( - prediction_length - self.input_chunk_length, - prediction_length, - ) - - # update past covariates to include next `roll_size` future past covariates elements - if n_past_covs and self.input_chunk_length >= roll_size: - input_past[ - :, -roll_size:, n_targets : n_targets + n_past_covs - ] = future_past_covariates[:, left_past:right_past, :] - elif n_past_covs: - input_past[ - :, :, n_targets : n_targets + n_past_covs - ] = future_past_covariates[:, left_past:right_past, :] - - # update historic future covariates to include next `roll_size` future covariates elements - if n_future_covs and self.input_chunk_length >= roll_size: - input_past[ - :, -roll_size:, n_targets + n_past_covs : - ] = future_covariates[:, left_past:right_past, :] - elif n_future_covs: - input_past[:, :, n_targets + n_past_covs :] = future_covariates[ - :, left_past:right_past, : - ] - - # ==========> FUTURE INPUT <========== - left_future, right_future = ( - right_past, - right_past + self.output_chunk_length, - ) - # update future covariates to include next `roll_size` future covariates elements - if n_future_covs: - input_future = future_covariates[:, left_future:right_future, :] - - # convert back into separate datasets - input_past_target = input_past[:, :, :n_targets] - input_past_covs = ( - input_past[:, :, n_targets : n_targets + n_past_covs] - if n_past_covs - else None - ) - input_historic_future_covs = ( - input_past[:, :, n_targets + n_past_covs :] if n_future_covs else None - ) - input_future_covs = input_future if n_future_covs else None - - # take only last part of the output sequence where needed - out = self._produce_predict_output( - x=( - input_past_target, - input_past_covs, - input_historic_future_covs, - input_future_covs, - ) - )[:, self.first_prediction_index :, :] - - batch_prediction.append(out) - prediction_length += self.output_chunk_length - - # bring predictions into desired format and drop unnecessary values - batch_prediction = torch.cat(batch_prediction, dim=1) - batch_prediction = batch_prediction[:, :n, :] - return batch_prediction diff --git a/darts/models/forecasting/torch_forecasting_model.py b/darts/models/forecasting/torch_forecasting_model.py index 445ec943a4..e15b9dc45f 100644 --- a/darts/models/forecasting/torch_forecasting_model.py +++ b/darts/models/forecasting/torch_forecasting_model.py @@ -18,25 +18,31 @@ """ import datetime +import inspect import os -import re import shutil from abc import ABC, abstractmethod from glob import glob from 
typing import Dict, List, Optional, Sequence, Tuple, Union import numpy as np +import pytorch_lightning as pl import torch -import torch.nn as nn -from joblib import Parallel, delayed +from pytorch_lightning import loggers as pl_loggers from torch import Tensor from torch.utils.data import DataLoader -from torch.utils.tensorboard import SummaryWriter -from darts.logging import get_logger, raise_if, raise_if_not, raise_log +from darts.logging import ( + get_logger, + raise_deprecation_warning, + raise_if, + raise_if_not, + raise_log, + suppress_lightning_warnings, +) from darts.models.forecasting.forecasting_model import GlobalForecastingModel +from darts.models.forecasting.pl_forecasting_module import PLForecastingModule from darts.timeseries import TimeSeries -from darts.utils import _build_tqdm_iterator from darts.utils.data.encoders import SequentialEncoder from darts.utils.data.inference_dataset import ( DualCovariatesInferenceDataset, @@ -64,34 +70,51 @@ from darts.utils.likelihood_models import Likelihood from darts.utils.torch import random_method -DEFAULT_DARTS_FOLDER = ".darts" +DEFAULT_DARTS_FOLDER = "darts_logs" CHECKPOINTS_FOLDER = "checkpoints" RUNS_FOLDER = "runs" +INIT_MODEL_NAME = "_model.pth.tar" logger = get_logger(__name__) def _get_checkpoint_folder(work_dir, model_name): - return os.path.join(work_dir, CHECKPOINTS_FOLDER, model_name) + return os.path.join(work_dir, model_name, CHECKPOINTS_FOLDER) + + +def _get_logs_folder(work_dir, model_name): + return os.path.join(work_dir, model_name) def _get_runs_folder(work_dir, model_name): - return os.path.join(work_dir, RUNS_FOLDER, model_name) + return os.path.join(work_dir, model_name) + + +def _get_checkpoint_fname(work_dir, model_name, best=False): + checkpoint_dir = _get_checkpoint_folder(work_dir, model_name) + path = os.path.join(checkpoint_dir, "best-*" if best else "last-*") + + checklist = glob(path) + if len(checklist) == 0: + raise_log( + FileNotFoundError( + "There is no file matching prefix {} in {}".format( + "best-*" if best else "last-*", checkpoint_dir + ) + ), + logger, + ) + + file_name = max(checklist, key=os.path.getctime) + return os.path.basename(file_name) class TorchForecastingModel(GlobalForecastingModel, ABC): - # TODO: add is_stochastic & reset methods + @random_method def __init__( self, - input_chunk_length: int, - output_chunk_length: int, batch_size: int = 32, n_epochs: int = 100, - optimizer_cls: torch.optim.Optimizer = torch.optim.Adam, - optimizer_kwargs: Optional[Dict] = None, - lr_scheduler_cls: torch.optim.lr_scheduler._LRScheduler = None, - lr_scheduler_kwargs: Optional[Dict] = None, - loss_fn: nn.modules.loss._Loss = nn.MSELoss(), model_name: str = None, work_dir: str = os.path.join(os.getcwd(), DEFAULT_DARTS_FOLDER), log_tensorboard: bool = False, @@ -100,51 +123,40 @@ def __init__( force_reset: bool = False, save_checkpoints: bool = False, add_encoders: Optional[Dict] = None, + random_state: Optional[int] = None, + pl_trainer_kwargs: Optional[Dict] = None, + show_warnings: bool = False, ): - """Pytorch-based Forecasting Model. + """Pytorch Lightning (PL)-based Forecasting Model. + + This class is meant to be inherited to create a new PL-based forecasting model. + It governs the interactions between: + - Darts forecasting models (module) :class:`PLTorchForecastingModel` + - Darts integrated PL Lightning Trainer :class:`pytorch_lightning.Trainer` or custom PL Trainers + - Dataset loaders :class:`TrainingDataset` and :class:`InferenceDataset` or custom Dataset Loaders. 
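Part of this coordination is splitting one flat set of constructor keyword arguments between the :class:`TorchForecastingModel` side (trainer-level settings) and the :class:`PLForecastingModule` side (module-level settings). The actual ``_extract_torch_model_params`` and ``_extract_pl_module_params`` implementations are not shown in this excerpt; the snippet below is only a simplified, hypothetical illustration of the general mechanism, presumably based on ``inspect`` (newly imported above). The helper ``split_kwargs_by_signature`` and the two dummy classes are invented names for the illustration.

.. highlight:: python
.. code-block:: python

    import inspect

    def split_kwargs_by_signature(kwargs, cls):
        """Return the subset of `kwargs` accepted by `cls.__init__`.
        Hypothetical helper illustrating the idea behind the `_extract_*_params` methods."""
        accepted = set(inspect.signature(cls.__init__).parameters) - {"self"}
        return {k: v for k, v in kwargs.items() if k in accepted}

    class DummyTrainerSide:
        def __init__(self, batch_size=32, n_epochs=100):
            self.batch_size, self.n_epochs = batch_size, n_epochs

    class DummyModuleSide:
        def __init__(self, input_chunk_length=12, hidden_dim=25):
            self.input_chunk_length, self.hidden_dim = input_chunk_length, hidden_dim

    user_kwargs = {"batch_size": 64, "hidden_dim": 50, "input_chunk_length": 24, "n_epochs": 3}
    trainer_side = split_kwargs_by_signature(user_kwargs, DummyTrainerSide)  # {'batch_size': 64, 'n_epochs': 3}
    module_side = split_kwargs_by_signature(user_kwargs, DummyModuleSide)    # {'hidden_dim': 50, 'input_chunk_length': 24}
..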
- This class is meant to be inherited to create a new pytorch-based forecasting module. When subclassing this class, please make sure to set the self.model attribute in the __init__ function and then call super().__init__ while passing the kwargs. Parameters ---------- - input_chunk_length - Number of past time steps that are fed to the internal forecasting module. - output_chunk_length - Number of time steps to be output by the internal forecasting module. batch_size Number of time series (input and output sequences) used in each training pass. n_epochs Number of epochs over which to train the model. - optimizer_cls - The PyTorch optimizer class to be used (default: `torch.optim.Adam`). - optimizer_kwargs - Optionally, some keyword arguments for the PyTorch optimizer (e.g., ``{'lr': 1e-3}`` - for specifying a learning rate). Otherwise the default values of the selected `optimizer_cls` - will be used. - lr_scheduler_cls - Optionally, the PyTorch learning rate scheduler class to be used. Specifying `None` corresponds - to using a constant learning rate. - lr_scheduler_kwargs - Optionally, some keyword arguments for the PyTorch optimizer. - loss_fn - PyTorch loss function used for training. - This parameter will be ignored for probabilistic models if the `likelihood` parameter is specified. - Default: ``torch.nn.MSELoss()``. model_name Name of the model. Used for creating checkpoints and saving tensorboard data. If not specified, - defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part of - the name is formatted with the local date and time, while PID is the processed ID (preventing models spawned - at the same time by different processes to share the same model_name). E.g., + defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part + of the name is formatted with the local date and time, while PID is the processed ID (preventing models + spawned at the same time by different processes to share the same model_name). E.g., ``"2021-06-14_09:53:32_torch_model_run_44607"``. work_dir Path of the working directory, where to save checkpoints and Tensorboard summaries. (default: current working directory). log_tensorboard If set, use Tensorboard to log the different parameters. The logs will be located in: - `[work_dir]/.darts/runs/`. + ``"{work_dir}/darts_logs/{model_name}/logs/"``. nr_epochs_val_period Number of epochs to wait before evaluating the validation loss (if a validation ``TimeSeries`` is passed to the :func:`fit()` method). @@ -152,30 +164,91 @@ def __init__( Optionally, a string indicating the torch device to use. (default: "cuda:0" if a GPU is available, otherwise "cpu") force_reset - If set to `True`, any previously-existing model with the same name will be reset (all checkpoints will + If set to ``True``, any previously-existing model with the same name will be reset (all checkpoints will be discarded). save_checkpoints Whether or not to automatically save the untrained model and checkpoints from training. - If set to `False`, the model can still be manually saved using :func:`save_model()` - and loaded using :func:`load_model()`. + To load the model from checkpoint, call :func:`MyModelClass.load_from_checkpoint()`, where + :class:`MyModelClass` is the :class:`TorchForecastingModel` class that was used (such as :class:`TFTModel`, + :class:`NBEATSModel`, etc.). If set to ``False``, the model can still be manually saved using + :func:`save_model()` and loaded using :func:`load_model()`. 
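A sketch of the checkpointing workflow described above, tying together ``save_checkpoints``, the ``best-*`` / ``last-*`` file naming and the ``{work_dir}/{model_name}/checkpoints/`` location introduced earlier in this file. The series, model name and hyperparameters are illustrative, and the exact signature of :func:`load_from_checkpoint` is not part of this excerpt; the call below assumes it accepts the model name and a ``best`` flag.

.. highlight:: python
.. code-block:: python

    import numpy as np
    from darts import TimeSeries
    from darts.models import RNNModel

    values = np.sin(np.arange(300) * 2 * np.pi / 12)
    train = TimeSeries.from_values(values[:240])
    val = TimeSeries.from_values(values[240:])

    model = RNNModel(
        input_chunk_length=12,
        training_length=24,
        n_epochs=5,
        model_name="rnn_checkpointed",
        save_checkpoints=True,  # best-* / last-* files under "{work_dir}/{model_name}/checkpoints/"
        force_reset=True,       # discard any previous run sharing the same model_name
    )
    # the best-* checkpoint monitors `val_loss`, so a validation series is passed here
    model.fit(train, val_series=val)

    # assumed signature: load_from_checkpoint(model_name, best=True)
    best_model = RNNModel.load_from_checkpoint("rnn_checkpointed", best=True)
..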
+ add_encoders + A large number of past and future covariates can be automatically generated with `add_encoders`. + This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that + will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to + transform the generated covariates. This happens all under one hood and only needs to be specified at + model creation. + Read :meth:`SequentialEncoder ` to find out more about + ``add_encoders``. An example showing some of ``add_encoders`` features: + + .. highlight:: python + .. code-block:: python + + add_encoders={ + 'cyclic': {'future': ['month']}, + 'datetime_attribute': {'future': ['hour', 'dayofweek']}, + 'position': {'past': ['absolute'], 'future': ['relative']}, + 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, + 'transformer': Scaler() + } + .. + random_state + Control the randomness of the weights initialization. Check this + `link `_ for more details. + pl_trainer_kwargs + By default :class:`TorchForecastingModel` creates a PyTorch Lightning Trainer with several useful presets + that performs the training, validation and prediction processes. These presets include automatic + checkpointing, tensorboard logging, setting the torch device and more. + With ``pl_trainer_kwargs`` you can add additional kwargs to instantiate the PyTorch Lightning trainer + object. Check the `PL Trainer documentation + `_ for more information about the + supported kwargs. + With parameter ``"callbacks"`` you can add custom or PyTorch-Lightning built-in callbacks to Darts' + :class:`TorchForecastingModel`. Below is an example for adding EarlyStopping to the training process. + The model will stop training early if the validation loss `val_loss` does not improve beyond + specifications. For more information on callbacks, visit: + `PyTorch Lightning Callbacks + `_ + + .. highlight:: python + .. code-block:: python + + from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + # stop training when validation loss does not decrease more than 0.05 (`min_delta`) over + # a period of 5 epochs (`patience`) + my_stopper = EarlyStopping( + monitor="val_loss", + patience=5, + min_delta=0.05, + mode='min', + ) + + pl_trainer_kwargs={"callbacks": [my_stopper]} + .. + + Note that you can also use a custom PyTorch Lightning Trainer for training and prediction with optional + parameter ``trainer`` in :func:`fit()` and :func:`predict()`. + show_warnings + whether to show warnings raised from PyTorch Lightning. Useful to detect potential issues of + your forecasting use case. 
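A short, hedged example of wiring these creation parameters together (the model class and all values are arbitrary; ``gradient_clip_val`` and ``gpus`` are standard PyTorch Lightning Trainer flags forwarded verbatim):

.. highlight:: python
.. code-block:: python

    from darts.models import TransformerModel

    model = TransformerModel(
        input_chunk_length=24,
        output_chunk_length=12,
        random_state=42,                  # reproducible weight initialization
        show_warnings=True,               # surface PyTorch Lightning warnings instead of filtering them
        pl_trainer_kwargs={
            "gradient_clip_val": 1.0,     # forwarded to pl.Trainer as-is
            "gpus": [0],                  # requires a CUDA-capable GPU; drop on CPU-only machines
        },
    )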
""" super().__init__() - - if torch_device_str is None: - self.device = self._get_best_torch_device() - else: - self.device = torch.device(torch_device_str) + suppress_lightning_warnings(suppress_all=not show_warnings) # We will fill these dynamically, upon first call of fit_from_dataset(): - self.model = None - self.train_sample = None - self.output_dim = None + self.model: Optional[PLForecastingModule] = None + self.train_sample: Optional[Tuple] = None + self.output_dim: Optional[int] = None - self.input_chunk_length = input_chunk_length - self.output_chunk_length = output_chunk_length - self.log_tensorboard = log_tensorboard - self.nr_epochs_val_period = nr_epochs_val_period + self.n_epochs = n_epochs + self.batch_size = batch_size + # by default models do not use encoders + self.add_encoders = add_encoders + self.encoders: Optional[SequentialEncoder] = None + + # get model name and work dir if model_name is None: current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S.%f") model_name = current_time + "_torch_model_run_" + str(os.getpid()) @@ -183,139 +256,248 @@ def __init__( self.model_name = model_name self.work_dir = work_dir - self.n_epochs = n_epochs - self.total_epochs = 0 # 0 means it wasn't trained yet. - self.batch_size = batch_size + # setup model save dirs + self.save_checkpoints = save_checkpoints + checkpoints_folder = _get_checkpoint_folder(self.work_dir, self.model_name) + log_folder = _get_logs_folder(self.work_dir, self.model_name) + checkpoint_exists = ( + os.path.exists(checkpoints_folder) + and len(glob(os.path.join(checkpoints_folder, "*"))) > 0 + ) - # Define the loss function - self.criterion = loss_fn + # setup model save dirs + if checkpoint_exists and save_checkpoints: + raise_if_not( + force_reset, + f"Some model data already exists for `model_name` '{self.model_name}'. Either load model to continue " + f"training or use `force_reset=True` to initialize anyway to start training from scratch and remove " + f"all the model data", + logger, + ) + self.reset_model() + elif save_checkpoints: + self._create_save_dirs() + else: + pass - # The tensorboard writer - self.tb_writer = None + # TODO: remove below in the next version ======> + accelerator, gpus, auto_select_gpus = self._extract_torch_devices( + torch_device_str + ) + # TODO: until here <====== + + # save best epoch on val_loss and last epoch under 'darts_logs/model_name/checkpoints/' + if save_checkpoints: + checkpoint_callback = pl.callbacks.ModelCheckpoint( + dirpath=checkpoints_folder, + save_last=True, + monitor="val_loss", + filename="best-{epoch}-{val_loss:.2f}", + ) + checkpoint_callback.CHECKPOINT_NAME_LAST = "last-{epoch}" + else: + checkpoint_callback = None - # Persist optimiser and LR scheduler parameters - self.optimizer_cls = optimizer_cls - self.optimizer_kwargs = dict() if optimizer_kwargs is None else optimizer_kwargs - self.lr_scheduler_cls = lr_scheduler_cls - self.lr_scheduler_kwargs = ( - dict() if lr_scheduler_kwargs is None else lr_scheduler_kwargs + # save tensorboard under 'darts_logs/model_name/logs/' + model_logger = ( + pl_loggers.TensorBoardLogger(save_dir=log_folder, name="", version="logs") + if log_tensorboard + else False ) - # by default models are deterministic (i.e. 
not probabilistic) - self.likelihood = None + # setup trainer parameters from model creation parameters + self.trainer_params = { + "accelerator": accelerator, + "gpus": gpus, + "auto_select_gpus": auto_select_gpus, + "logger": model_logger, + "max_epochs": n_epochs, + "check_val_every_n_epoch": nr_epochs_val_period, + "enable_checkpointing": save_checkpoints, + "callbacks": [cb for cb in [checkpoint_callback] if cb is not None], + } + + # update trainer parameters with user defined `pl_trainer_kwargs` + if pl_trainer_kwargs is not None: + pl_trainer_kwargs_copy = { + key: val for key, val in pl_trainer_kwargs.items() + } + self.n_epochs = pl_trainer_kwargs_copy.get("max_epochs", self.n_epochs) + self.trainer_params["callbacks"] += pl_trainer_kwargs_copy.pop( + "callbacks", [] + ) + self.trainer_params = dict(self.trainer_params, **pl_trainer_kwargs_copy) - # by default models do not use encoders - self.encoders = None + # pytorch lightning trainer will be created at training time + self.trainer: Optional[pl.Trainer] = None + self.load_ckpt_path: Optional[str] = None - self.force_reset = force_reset - self.save_checkpoints = save_checkpoints - checkpoints_folder = _get_checkpoint_folder(self.work_dir, self.model_name) - self.checkpoint_exists = ( - os.path.exists(checkpoints_folder) - and len(glob(os.path.join(checkpoints_folder, "checkpoint_*"))) > 0 - ) + # pl_module_params must be set in __init__ method of TorchForecastingModel subclass + self.pl_module_params: Optional[Dict] = None - if self.checkpoint_exists and self.save_checkpoints: - if self.force_reset: - self.reset_model() - else: - raise AttributeError( - "You already have model data for the '{}' name. Either load model to continue" - " training or use `force_reset=True` to initialize anyway to start" - " training from scratch and remove all the model data".format( - self.model_name - ) - ) + @staticmethod + def _extract_torch_devices(torch_device_str) -> Tuple[str, Optional[list], bool]: + """This method handles the deprecated `torch_device_str` and should be removed in a future Darts version. - @property - def min_train_series_length(self) -> int: - """ - Class property defining the minimum required length for the training series; - overriding the default value of 3 of ForecastingModel + Returns + ------- + Tuple + (accelerator, gpus, auto_select_gpus) """ - return self.input_chunk_length + self.output_chunk_length - def _batch_collate_fn(self, batch: List[Tuple]) -> Tuple: - """ - Returns a batch Tuple from a list of samples - """ - aggregated = [] - first_sample = batch[0] - for i in range(len(first_sample)): - elem = first_sample[i] - if isinstance(elem, np.ndarray): - aggregated.append( - torch.from_numpy(np.stack([sample[i] for sample in batch], axis=0)) - ) - elif elem is None: - aggregated.append(None) - elif isinstance(elem, TimeSeries): - aggregated.append([sample[i] for sample in batch]) - return tuple(aggregated) + if torch_device_str is None: + return "auto", None, False + + device_warning = ( + "`torch_device_str` is deprecated and will be removed in a coming Darts version. For full support " + "of all torch devices, use PyTorch-Lightnings trainer flags and pass them inside " + "`pl_trainer_kwargs`. Flags of interest are {`accelerator`, `gpus`, `auto_select_gpus`, `devices`}. 
" + "For more information, visit " + "https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#trainer-flags" + ) + raise_deprecation_warning(device_warning, logger) + # check torch device + raise_if_not( + any( + [ + device_str in torch_device_str + for device_str in ["cuda", "cpu", "auto"] + ] + ), + f"unknown torch_device_str `{torch_device_str}`. String must contain one of `('cuda', 'cpu', 'auto') " + + device_warning, + logger, + ) + device_split = torch_device_str.split(":") + + gpus = None + auto_select_gpus = False + accelerator = device_split[0] + if len(device_split) == 2 and accelerator == "cuda": + gpus = device_split[1] + gpus = [int(gpus)] + elif len(device_split) == 1: + if accelerator == "cuda": + accelerator = "gpu" + gpus = -1 + auto_select_gpus = True + else: + raise_if( + True, + f"unknown torch_device_str `{torch_device_str}`. " + device_warning, + logger, + ) + return accelerator, gpus, auto_select_gpus - def reset_model(self): - """Resets the model object and removes all the stored data - model, checkpoints and training history.""" - shutil.rmtree( - _get_checkpoint_folder(self.work_dir, self.model_name), ignore_errors=True + @staticmethod + def _extract_torch_model_params(**kwargs): + """extract params from model creation to set up TorchForecastingModels""" + get_params = list( + inspect.signature(TorchForecastingModel.__init__).parameters.keys() + ) + get_params.remove("self") + return {kwarg: kwargs.get(kwarg) for kwarg in get_params if kwarg in kwargs} + + @staticmethod + def _extract_pl_module_params(**kwargs): + """Extract params from model creation to set up PLForecastingModule (the actual torch.nn.Module)""" + get_params = list( + inspect.signature(PLForecastingModule.__init__).parameters.keys() ) + get_params.remove("self") + return {kwarg: kwargs.get(kwarg) for kwarg in get_params if kwarg in kwargs} + + def _create_save_dirs(self): + """Create work dir and model dir""" + if not os.path.exists(self.work_dir): + os.mkdir(self.work_dir) + if not os.path.exists(_get_runs_folder(self.work_dir, self.model_name)): + os.mkdir(_get_runs_folder(self.work_dir, self.model_name)) + + def _remove_save_dirs(self): shutil.rmtree( _get_runs_folder(self.work_dir, self.model_name), ignore_errors=True ) - self.checkpoint_exists = False - self.total_epochs = 0 + def reset_model(self): + """Resets the model object and removes all stored data - model, checkpoints, loggers and training history.""" + self._remove_save_dirs() + self._create_save_dirs() + self.model = None + self.trainer = None self.train_sample = None - def _init_model(self) -> None: - """ - Init self.model - the torch module of this class, based on examples of input/output tensors (to get the - sizes right). - """ + def _init_model(self, trainer: Optional[pl.Trainer] = None) -> None: + """Initializes model and trainer based on examples of input/output tensors (to get the sizes right):""" + + raise_if( + self.pl_module_params is None, + "`pl_module_params` must be extracted in __init__ method of `TorchForecastingModel` subclass after " + "calling `super.__init__(...)`. 
Do this with `self._extract_pl_module_params(**self.model_params).`", + ) # the tensors have shape (chunk_length, nr_dimensions) self.model = self._create_model(self.train_sample) - if np.issubdtype(self.train_sample[0].dtype, np.float32): + precision = None + dtype = self.train_sample[0].dtype + if np.issubdtype(dtype, np.float32): logger.info("Time series values are 32-bits; casting model to float32.") - self.model = self.model.float() - - elif np.issubdtype(self.train_sample[0].dtype, np.float64): + precision = 32 + elif np.issubdtype(dtype, np.float64): logger.info("Time series values are 64-bits; casting model to float64.") - self.model = self.model.double() - - self.model = self.model.to(self.device) - - # A utility function to create optimizer and lr scheduler from desired classes - def _create_from_cls_and_kwargs(cls, kws): - try: - return cls(**kws) - except (TypeError, ValueError) as e: - raise_log( - ValueError( - "Error when building the optimizer or learning rate scheduler;" - "please check the provided class and arguments" - "\nclass: {}" - "\narguments (kwargs): {}" - "\nerror:\n{}".format(cls, kws, e) - ), - logger, - ) + precision = 64 - # Create the optimizer and (optionally) the learning rate scheduler - # we have to create copies because we cannot save model.parameters into object state (not serializable) - optimizer_kws = {k: v for k, v in self.optimizer_kwargs.items()} - optimizer_kws["params"] = self.model.parameters() - self.optimizer = _create_from_cls_and_kwargs(self.optimizer_cls, optimizer_kws) - - if self.lr_scheduler_cls is not None: - lr_sched_kws = {k: v for k, v in self.lr_scheduler_kwargs.items()} - lr_sched_kws["optimizer"] = self.optimizer - self.lr_scheduler = _create_from_cls_and_kwargs( - self.lr_scheduler_cls, lr_sched_kws + precision_user = ( + self.trainer_params.get("precision", None) + if trainer is None + else trainer.precision + ) + raise_if( + precision_user is not None and precision_user != precision, + f"User-defined trainer_kwarg `precision={precision_user}`-bit does not match dtype: `{dtype}` of the " + f"underlying TimeSeries. 
Set `precision` to `{precision}` or cast your data to `{precision_user}-" + f"bit` with `TimeSeries.astype(np.float{precision_user})`.", + logger, + ) + + self.trainer_params["precision"] = precision + + # we need to save the initialized TorchForecastingModel as PyTorch-Lightning only saves module checkpoints + if self.save_checkpoints: + self.save_model( + os.path.join( + _get_runs_folder(self.work_dir, self.model_name), INIT_MODEL_NAME + ) ) - else: - self.lr_scheduler = None # We won't use a LR scheduler + + def _setup_trainer( + self, trainer: Optional[pl.Trainer], verbose: bool, epochs: int = 0 + ) -> None: + """Sets up the PyTorch-Lightning trainer for training or prediction.""" + + self.trainer_params["enable_model_summary"] = ( + verbose if self.model.epochs_trained == 0 else False + ) + self.trainer_params["enable_progress_bar"] = verbose + + self.trainer = ( + self._init_trainer(trainer_params=self.trainer_params, max_epochs=epochs) + if trainer is None + else trainer + ) + + @staticmethod + def _init_trainer( + trainer_params: Dict, max_epochs: Optional[int] = None + ) -> pl.Trainer: + """Initializes the PyTorch-Lightning trainer for training or prediction from `trainer_params`.""" + trainer_params_copy = {param: val for param, val in trainer_params.items()} + if max_epochs is not None: + trainer_params_copy["max_epochs"] = max_epochs + + return pl.Trainer(**trainer_params_copy) @abstractmethod def _create_model(self, train_sample: Tuple[Tensor]) -> torch.nn.Module: @@ -380,20 +562,6 @@ def _verify_past_future_covariates(self, past_covariates, future_covariates): """ pass - @abstractmethod - def _produce_train_output(self, input_batch: Tuple) -> Tensor: - pass - - @abstractmethod - def _get_batch_prediction( - self, n: int, input_batch: Tuple, roll_size: int - ) -> Tensor: - """ - In charge of apply the recurrent logic for non-recurrent models. - Should be overwritten by recurrent models. - """ - pass - @random_method def fit( self, @@ -403,7 +571,8 @@ def fit( val_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, val_past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, val_future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - verbose: bool = False, + trainer: Optional[pl.Trainer] = None, + verbose: Optional[bool] = None, epochs: int = 0, max_samples_per_ts: Optional[int] = None, num_loader_workers: int = 0, @@ -414,12 +583,17 @@ def fit( dataset for this model. If you need more control on how the series are sliced for training, consider calling :func:`fit_from_dataset()` with a custom :class:`darts.utils.data.TrainingDataset`. - This function can be called several times to do some extra training. If `epochs` is specified, the model - will be trained for some (extra) `epochs` epochs. + Training is performed with a PyTorch Lightning Trainer. It uses a default Trainer object from presets and + ``pl_trainer_kwargs`` used at model creation. You can also use a custom Trainer with optional parameter + ``trainer``. For more information on PyTorch Lightning Trainers check out `this link + `_ . + + This function can be called several times to do some extra training. If ``epochs`` is specified, the model + will be trained for some (extra) ``epochs`` epochs. Below, all possible parameters are documented, but not all models support all parameters. For instance, - all the :class:`PastCovariatesTorchModel` support only `past_covariates` and not `future_covariates`. 
Darts will - complain if you try fitting a model with the wrong covariates argument. + all the :class:`PastCovariatesTorchModel` support only ``past_covariates`` and not ``future_covariates``. + Darts will complain if you try fitting a model with the wrong covariates argument. When handling covariates, Darts will try to use the time axes of the target and the covariates to come up with the right time slices. So the covariates can be longer than needed; as long as the time axes @@ -437,13 +611,16 @@ def fit( Optionally, one or a sequence of validation target series, which will be used to compute the validation loss throughout training and keep track of the best performing models. val_past_covariates - Optionally, the past covariates corresponding to the validation series (must match `covariates`) + Optionally, the past covariates corresponding to the validation series (must match ``covariates``) val_future_covariates - Optionally, the future covariates corresponding to the validation series (must match `covariates`) + Optionally, the future covariates corresponding to the validation series (must match ``covariates``) + trainer + Optionally, a custom PyTorch-Lightning Trainer object to perform training. Using a custom ``trainer`` will + override Darts' default trainer. verbose Optionally, whether to print progress. epochs - If specified, will train the model for `epochs` (additional) epochs, irrespective of what `n_epochs` + If specified, will train the model for ``epochs`` (additional) epochs, irrespective of what ``n_epochs`` was provided to the model constructor. max_samples_per_ts Optionally, a maximum number of samples to use per time series. Models are trained in a supervised fashion @@ -452,7 +629,7 @@ def fit( series (taking only the most recent samples in each series). Leaving to None does not apply any upper bound. num_loader_workers - Optionally, an integer specifying the `num_workers` to use in PyTorch ``DataLoader`` instances, + Optionally, an integer specifying the ``num_workers`` to use in PyTorch ``DataLoader`` instances, both for the training and validation loaders (if any). A larger number of workers can sometimes increase performance, but can also incur extra overheads and increase memory usage, as more batches are loaded in parallel. @@ -547,33 +724,7 @@ def wrap_fn( logger.info(f"Train dataset contains {len(train_dataset)} samples.") return self.fit_from_dataset( - train_dataset, val_dataset, verbose, epochs, num_loader_workers - ) - - @property - @abstractmethod - def _model_encoder_settings(self) -> Tuple[int, int, bool, bool]: - """Abstract property that returns model specific encoder settings that are used to initialize the encoders. 
- - Must return Tuple (input_chunk_length, output_chunk_length, takes_past_covariates, takes_future_covariates) - """ - pass - - def initialize_encoders(self) -> SequentialEncoder: - - ( - input_chunk_length, - output_chunk_length, - takes_past_covariates, - takes_future_covariates, - ) = self._model_encoder_settings - - return SequentialEncoder( - add_encoders=self._model_params[1].get("add_encoders", None), - input_chunk_length=input_chunk_length, - output_chunk_length=output_chunk_length, - takes_past_covariates=takes_past_covariates, - takes_future_covariates=takes_future_covariates, + train_dataset, val_dataset, trainer, verbose, epochs, num_loader_workers ) @random_method @@ -581,7 +732,8 @@ def fit_from_dataset( self, train_dataset: TrainingDataset, val_dataset: Optional[TrainingDataset] = None, - verbose: bool = False, + trainer: Optional[pl.Trainer] = None, + verbose: Optional[bool] = None, epochs: int = 0, num_loader_workers: int = 0, ): @@ -591,8 +743,13 @@ def fit_from_dataset( for training. If you are not sure which training dataset to use, consider calling :func:`fit()` instead, which will create a default training dataset appropriate for this model. - This function can be called several times to do some extra training. If `epochs` is specified, the model - will be trained for some (extra) `epochs` epochs. + Training is performed with a PyTorch Lightning Trainer. It uses a default Trainer object from presets and + ``pl_trainer_kwargs`` used at model creation. You can also use a custom Trainer with optional parameter + ``trainer``. For more information on PyTorch Lightning Trainers check out `this link + `_ . + + This function can be called several times to do some extra training. If ``epochs`` is specified, the model + will be trained for some (extra) ``epochs`` epochs. Parameters ---------- @@ -602,13 +759,16 @@ def fit_from_dataset( val_dataset A training dataset with a type matching this model (e.g. :class:`PastCovariatesTrainingDataset` for :class:`PastCovariatesTorchModel`s), representing the validation set (to track the validation loss). + trainer + Optionally, a custom PyTorch-Lightning Trainer object to perform prediction. Using a custom `trainer` will + override Darts' default trainer. verbose Optionally, whether to print progress. epochs - If specified, will train the model for `epochs` (additional) epochs, irrespective of what `n_epochs` + If specified, will train the model for ``epochs`` (additional) epochs, irrespective of what ``n_epochs`` was provided to the model constructor. num_loader_workers - Optionally, an integer specifying the `num_workers` to use in PyTorch ``DataLoader`` instances, + Optionally, an integer specifying the ``num_workers`` to use in PyTorch ``DataLoader`` instances, both for the training and validation loaders (if any). A larger number of workers can sometimes increase performance, but can also incur extra overheads and increase memory usage, as more batches are loaded in parallel. @@ -635,7 +795,7 @@ def fit_from_dataset( if self.model is None: # Build model, based on the dimensions of the first series in the train set. self.train_sample, self.output_dim = train_sample, train_sample[-1].shape[1] - self._init_model() + self._init_model(trainer) else: # Check existing model has input/output dims matching what's provided in the training set. 
raise_if_not( @@ -687,21 +847,62 @@ def fit_from_dataset( ) ) - # Prepare tensorboard writer - tb_writer = self._prepare_tensorboard_writer() - # if user wants to train the model for more epochs, ignore the n_epochs parameter train_num_epochs = epochs if epochs > 0 else self.n_epochs + if verbose is not None: + raise_deprecation_warning( + "kwarg `verbose` is deprecated and will be removed in a future Darts version. " + "Instead, control verbosity with PyTorch Lightning Trainer parameters `enable_progress_bar`, " + "`progress_bar_refresh_rate` and `enable_model_summary` in the `pl_trainer_kwargs` dict " + "at model creation.", + logger, + ) + verbose = True if verbose is None else verbose + + # setup trainer + self._setup_trainer(trainer, verbose, train_num_epochs) + + # TODO: multiple training without loading from checkpoint is not trivial (I believe PyTorch-Lightning is still + # working on that, see https://github.com/PyTorchLightning/pytorch-lightning/issues/9636) + if self.epochs_trained > 0 and not self.load_ckpt_path: + logger.warn( + "Attempting to retrain the model without resuming from a checkpoint. This is currently " + "discouraged. Consider setting `save_checkpoints` to `True` and specifying `model_name` at model " + f"creation. Then call `model = {self.__class__.__name__}.load_from_checkpoint(model_name, " + "best=False)`. Finally, train the model with `model.fit(..., epochs=new_epochs)` where " + "`new_epochs` is the sum of (epochs already trained + some additional epochs)." + ) + # Train model - self._train(train_loader, val_loader, tb_writer, verbose, train_num_epochs) + self._train(train_loader, val_loader) + return self + + def _train( + self, train_loader: DataLoader, val_loader: Optional[DataLoader] + ) -> None: + """ + Performs the actual training + + Parameters + ---------- + train_loader + the training data loader feeding the training data and targets + val_loader + optionally, a validation set loader + """ - # Close tensorboard writer - if tb_writer is not None: - tb_writer.flush() - tb_writer.close() + # if model was loaded from checkpoint (when `load_ckpt_path is not None`) and model.fit() is called, + # we resume training + ckpt_path = self.load_ckpt_path + self.load_ckpt_path = None - return self + self.trainer.fit( + self.model, + train_dataloaders=train_loader, + val_dataloaders=val_loader, + ckpt_path=ckpt_path, + ) @random_method def predict( @@ -710,28 +911,34 @@ def predict( series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, + trainer: Optional[pl.Trainer] = None, batch_size: Optional[int] = None, - verbose: bool = False, + verbose: Optional[bool] = None, n_jobs: int = 1, roll_size: Optional[int] = None, num_samples: int = 1, num_loader_workers: int = 0, ) -> Union[TimeSeries, Sequence[TimeSeries]]: - """Predict the `n` time step following the end of the training series, or of the specified `series`. + """Predict the ``n`` time step following the end of the training series, or of the specified ``series``. + + Prediction is performed with a PyTorch Lightning Trainer. It uses a default Trainer object from presets and + ``pl_trainer_kwargs`` used at model creation. You can also use a custom Trainer with optional parameter + ``trainer``. For more information on PyTorch Lightning Trainers check out `this link + `_ . 
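To make the custom-trainer path above concrete, here is a hedged end-to-end sketch (the synthetic series, epoch count and flags are illustrative only; the cast to ``float32`` keeps the series dtype consistent with the default 32-bit Trainer precision):

.. highlight:: python
.. code-block:: python

    import numpy as np
    import pytorch_lightning as pl
    from darts import TimeSeries
    from darts.models import TransformerModel

    # synthetic target series, cast to float32 so the precision check passes
    series = TimeSeries.from_values(np.sin(np.arange(200) / 10.0)).astype(np.float32)

    model = TransformerModel(input_chunk_length=24, output_chunk_length=12)

    # a user-supplied Trainer overrides the presets built from `pl_trainer_kwargs`
    trainer = pl.Trainer(max_epochs=3, enable_progress_bar=False)
    model.fit(series, trainer=trainer)

    # predict() reuses the trainer set up during fit() unless another `trainer=` is passed
    forecast = model.predict(n=12)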
Below, all possible parameters are documented, but not all models support all parameters. For instance, - all the :class:`PastCovariatesTorchModel` support only `past_covariates` and not `future_covariates`. + all the :class:`PastCovariatesTorchModel` support only ``past_covariates`` and not ``future_covariates``. Darts will complain if you try calling :func:`predict()` on a model with the wrong covariates argument. Darts will also complain if the provided covariates do not have a sufficient time span. In general, not all models require the same covariates' time spans: - * | Models relying on past covariates require the last `input_chunk_length` of the `past_covariates` - | points to be known at prediction time. For horizon values `n > output_chunk_length`, these models - | require at least the next `n - output_chunk_length` future values to be known as well. - * | Models relying on future covariates require the next `n` values to be known. + * | Models relying on past covariates require the last ``input_chunk_length`` of the ``past_covariates`` + | points to be known at prediction time. For horizon values ``n > output_chunk_length``, these models + | require at least the next ``n - output_chunk_length`` future values to be known as well. + * | Models relying on future covariates require the next ``n`` values to be known. | In addition (for :class:`DualCovariatesTorchModel` and :class:`MixedCovariatesTorchModel`), they also - | require the "historic" values of these future covariates (over the past `input_chunk_length`). + | require the "historic" values of these future covariates (over the past ``input_chunk_length``). When handling covariates, Darts will try to use the time axes of the target and the covariates to come up with the right time slices. So the covariates can be longer than needed; as long as the time axes @@ -751,22 +958,25 @@ def predict( future_covariates Optionally, the future-known covariates series needed as inputs for the model. They must match the covariates used for training in terms of dimension. + trainer + Optionally, a custom PyTorch-Lightning Trainer object to perform prediction. Using a custom ``trainer`` + will override Darts' default trainer. batch_size - Size of batches during prediction. Defaults to the models' training `batch_size` value. + Size of batches during prediction. Defaults to the models' training ``batch_size`` value. verbose Optionally, whether to print progress. n_jobs - The number of jobs to run in parallel. `-1` means using all processors. Defaults to `1`. + The number of jobs to run in parallel. ``-1`` means using all processors. Defaults to ``1``. roll_size - For self-consuming predictions, i.e. `n > output_chunk_length`, determines how many + For self-consuming predictions, i.e. ``n > output_chunk_length``, determines how many outputs of the model are fed back into it at every iteration of feeding the predicted target (and optionally future covariates) back into the model. If this parameter is not provided, - it will be set `output_chunk_length` by default. + it will be set ``output_chunk_length`` by default. num_samples Number of times a prediction is sampled from a probabilistic model. Should be left set to 1 for deterministic models. num_loader_workers - Optionally, an integer specifying the `num_workers` to use in PyTorch ``DataLoader`` instances, + Optionally, an integer specifying the ``num_workers`` to use in PyTorch ``DataLoader`` instances, for the inference/prediction dataset loaders (if any). 
A larger number of workers can sometimes increase performance, but can also incur extra overheads and increase memory usage, as more batches are loaded in parallel. @@ -774,8 +984,8 @@ def predict( Returns ------- Union[TimeSeries, Sequence[TimeSeries]] - One or several time series containing the forecasts of `series`, or the forecast of the training series - if `series` is not specified and the model has been trained on a single series. + One or several time series containing the forecasts of ``series``, or the forecast of the training series + if ``series`` is not specified and the model has been trained on a single series. """ super().predict(n, series, past_covariates, future_covariates) @@ -825,20 +1035,24 @@ def predict( predictions = self.predict_from_dataset( n, dataset, + trainer=trainer, verbose=verbose, batch_size=batch_size, n_jobs=n_jobs, roll_size=roll_size, num_samples=num_samples, ) + return predictions[0] if called_with_single_series else predictions + @random_method def predict_from_dataset( self, n: int, input_series_dataset: InferenceDataset, + trainer: Optional[pl.Trainer] = None, batch_size: Optional[int] = None, - verbose: bool = False, + verbose: Optional[bool] = None, n_jobs: int = 1, roll_size: Optional[int] = None, num_samples: int = 1, @@ -847,10 +1061,15 @@ def predict_from_dataset( """ This method allows for predicting with a specific :class:`darts.utils.data.InferenceDataset` instance. - These datasets implement a PyTorch `Dataset`, and specify how the target and covariates are sliced + These datasets implement a PyTorch ``Dataset``, and specify how the target and covariates are sliced for inference. In most cases, you'll rather want to call :func:`predict()` instead, which will create an appropriate :class:`InferenceDataset` for you. + Prediction is performed with a PyTorch Lightning Trainer. It uses a default Trainer object from presets and + ``pl_trainer_kwargs`` used at model creation. You can also use a custom Trainer with optional parameter + ``trainer``. For more information on PyTorch Lightning Trainers check out `this link + `_ . + Parameters ---------- n @@ -859,22 +1078,25 @@ def predict_from_dataset( Optionally, a series or sequence of series, representing the history of the target series' whose future is to be predicted. If specified, the method returns the forecasts of these series. Otherwise, the method returns the forecast of the (single) training series. + trainer + Optionally, a custom PyTorch-Lightning Trainer object to perform prediction. Using a custom ``trainer`` + will override Darts' default trainer. batch_size - Size of batches during prediction. Defaults to the models `batch_size` value. + Size of batches during prediction. Defaults to the models ``batch_size`` value. verbose Shows the progress bar for batch predicition. Off by default. n_jobs - The number of jobs to run in parallel. `-1` means using all processors. Defaults to `1`. + The number of jobs to run in parallel. ``-1`` means using all processors. Defaults to ``1``. roll_size - For self-consuming predictions, i.e. `n > output_chunk_length`, determines how many + For self-consuming predictions, i.e. ``n > output_chunk_length``, determines how many outputs of the model are fed back into it at every iteration of feeding the predicted target (and optionally future covariates) back into the model. If this parameter is not provided, - it will be set `output_chunk_length` by default. + it will be set ``output_chunk_length`` by default. 
num_samples Number of times a prediction is sampled from a probabilistic model. Should be left set to 1 for deterministic models. num_loader_workers - Optionally, an integer specifying the `num_workers` to use in PyTorch ``DataLoader`` instances, + Optionally, an integer specifying the ``num_workers`` to use in PyTorch ``DataLoader`` instances, for the inference/prediction dataset loaders (if any). A larger number of workers can sometimes increase performance, but can also incur extra overheads and increase memory usage, as more batches are loaded in parallel. @@ -903,6 +1125,15 @@ def predict_from_dataset( # iterate through batches to produce predictions batch_size = batch_size or self.batch_size + # set prediction parameters + self.model.set_predict_parameters( + n=n, + num_samples=num_samples, + roll_size=roll_size, + batch_size=batch_size, + n_jobs=n_jobs, + ) + pred_loader = DataLoader( input_series_dataset, batch_size=batch_size, @@ -912,95 +1143,57 @@ def predict_from_dataset( drop_last=False, collate_fn=self._batch_collate_fn, ) - predictions = [] - iterator = _build_tqdm_iterator(pred_loader, verbose=verbose) - - self.model.eval() - with torch.no_grad(): - for batch_tuple in iterator: - batch_tuple = self._batch_to_device(batch_tuple) - input_data_tuple, batch_input_series = batch_tuple[:-1], batch_tuple[-1] - - # number of individual series to be predicted in current batch - num_series = input_data_tuple[0].shape[0] - - # number of of times the input tensor should be tiled to produce predictions for multiple samples - # this variable is larger than 1 only if the batch_size is at least twice as large as the number - # of individual time series being predicted in current batch (`num_series`) - batch_sample_size = min(max(batch_size // num_series, 1), num_samples) - - # counts number of produced prediction samples for every series to be predicted in current batch - sample_count = 0 - - # repeat prediction procedure for every needed sample - batch_predictions = [] - while sample_count < num_samples: - - # make sure we don't produce too many samples - if sample_count + batch_sample_size > num_samples: - batch_sample_size = num_samples - sample_count - - # stack multiple copies of the tensors to produce probabilistic forecasts - input_data_tuple_samples = self._sample_tiling( - input_data_tuple, batch_sample_size - ) - - # get predictions for 1 whole batch (can include predictions of multiple series - # and for multiple samples if a probabilistic forecast is produced) - batch_prediction = self._get_batch_prediction( - n, input_data_tuple_samples, roll_size - ) - - # reshape from 3d tensor (num_series x batch_sample_size, ...) 
- # into 4d tensor (batch_sample_size, num_series, ...), where dim 0 represents the samples - out_shape = batch_prediction.shape - batch_prediction = batch_prediction.reshape( - ( - batch_sample_size, - num_series, - ) - + out_shape[1:] - ) - - # save all predictions and update the `sample_count` variable - batch_predictions.append(batch_prediction) - sample_count += batch_sample_size - - # concatenate the batch of samples, to form num_samples samples - batch_predictions = torch.cat(batch_predictions, dim=0) - batch_predictions = batch_predictions.cpu().detach().numpy() - - # create `TimeSeries` objects from prediction tensors - ts_forecasts = Parallel(n_jobs=n_jobs)( - delayed(self._build_forecast_series)( - [ - batch_prediction[batch_idx] - for batch_prediction in batch_predictions - ], - input_series, - ) - for batch_idx, input_series in enumerate(batch_input_series) - ) - predictions.extend(ts_forecasts) + if verbose is not None: + raise_deprecation_warning( + "kwarg `verbose` is deprecated and will be removed in a future Darts version. " + "Instead, control verbosity with PyTorch Lightning Trainer parameters `enable_progress_bar`, " + "`progress_bar_refresh_rate` and `enable_model_summary` in the `pl_trainer_kwargs` dict " + "at model creation.", + logger, + ) + verbose = True if verbose is None else verbose - return predictions + # setup trainer. will only be re-instantiated if both `trainer` and `self.trainer` are `None` + trainer = trainer if trainer is not None else self.trainer + self._setup_trainer(trainer=trainer, verbose=verbose, epochs=self.n_epochs) - def _sample_tiling(self, input_data_tuple, batch_sample_size): - tiled_input_data = [] - for tensor in input_data_tuple: - if tensor is not None: - tiled_input_data.append(tensor.tile((batch_sample_size, 1, 1))) - else: - tiled_input_data.append(None) - return tuple(tiled_input_data) + # if model checkpoint was loaded without calling fit afterwards (when `load_ckpt_path is not None`), + # trainer needs to be instantiated here + ckpt_path = self.load_ckpt_path + self.load_ckpt_path = None - def _batch_to_device(self, batch): - batch = [ - elem.to(self.device) if isinstance(elem, torch.Tensor) else elem - for elem in batch - ] - return tuple(batch) + # prediction output comes as nested list: list of predicted `TimeSeries` for each batch. + predictions = self.trainer.predict(self.model, pred_loader, ckpt_path=ckpt_path) + # flatten and return + return [ts for batch in predictions for ts in batch] + + @property + @abstractmethod + def _model_encoder_settings(self) -> Tuple[int, int, bool, bool]: + """Abstract property that returns model specific encoder settings that are used to initialize the encoders. 
+ + Must return Tuple (input_chunk_length, output_chunk_length, takes_past_covariates, takes_future_covariates) + """ + pass + + def initialize_encoders(self) -> SequentialEncoder: + """instantiates the SequentialEncoder object based on self._model_encoder_settings and parameter + ``add_encoders`` used at model creation""" + ( + input_chunk_length, + output_chunk_length, + takes_past_covariates, + takes_future_covariates, + ) = self._model_encoder_settings + + return SequentialEncoder( + add_encoders=self.add_encoders, + input_chunk_length=input_chunk_length, + output_chunk_length=output_chunk_length, + takes_past_covariates=takes_past_covariates, + takes_future_covariates=takes_future_covariates, + ) @property def first_prediction_index(self) -> int: @@ -1009,127 +1202,32 @@ def first_prediction_index(self) -> int: """ return 0 - def _train( - self, - train_loader: DataLoader, - val_loader: Optional[DataLoader], - tb_writer: Optional[SummaryWriter], - verbose: bool, - epochs: int = 0, - ) -> None: + @property + def min_train_series_length(self) -> int: """ - Performs the actual training - :param train_loader: the training data loader feeding the training data and targets - :param val_loader: optionally, a validation set loader - :param tb_writer: optionally, a TensorBoard writer - :param epochs: value >0 means we're retraining model + Class property defining the minimum required length for the training series; + overriding the default value of 3 of ForecastingModel """ + return self.input_chunk_length + self.output_chunk_length - best_loss = np.inf - - iterator = _build_tqdm_iterator( - range(self.total_epochs, self.total_epochs + epochs), - verbose=verbose, - ) - - for epoch in iterator: - total_loss = 0 - - for batch_idx, train_batch in enumerate(train_loader): - self.model.train() - train_batch = self._batch_to_device(train_batch) - output = self._produce_train_output(train_batch[:-1]) - target = train_batch[ - -1 - ] # By convention target is always the last element returned by datasets - loss = self._compute_loss(output, target) - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - total_loss += loss.item() - if isinstance( - self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau - ): - self.lr_scheduler.step(loss) - elif self.lr_scheduler is not None: - self.lr_scheduler.step() - - if tb_writer is not None: - for name, param in self.model.named_parameters(): - # if the param doesn't require gradient, then param.grad = None and param.grad.data will crash - if param.requires_grad: - tb_writer.add_histogram( - name + "/gradients", param.grad.data.cpu().numpy(), epoch - ) - - tb_writer.add_scalar( - "training/loss", total_loss / (batch_idx + 1), epoch - ) - tb_writer.add_scalar( - "training/loss_total", total_loss / (batch_idx + 1), epoch - ) - tb_writer.add_scalar( - "training/learning_rate", self._get_learning_rate(), epoch - ) - - self.total_epochs = epoch + 1 - - if self.save_checkpoints: - self._save_model_from_fit( - is_best=False, - folder=_get_checkpoint_folder(self.work_dir, self.model_name), - epoch=epoch, + @staticmethod + def _batch_collate_fn(batch: List[Tuple]) -> Tuple: + """ + Returns a batch Tuple from a list of samples + """ + aggregated = [] + first_sample = batch[0] + for i in range(len(first_sample)): + elem = first_sample[i] + if isinstance(elem, np.ndarray): + aggregated.append( + torch.from_numpy(np.stack([sample[i] for sample in batch], axis=0)) ) - - if epoch % self.nr_epochs_val_period == 0: - training_loss = total_loss / 
len(train_loader) - if val_loader is not None: - validation_loss = self._evaluate_validation_loss(val_loader) - if tb_writer is not None: - tb_writer.add_scalar( - "validation/loss_total", validation_loss, epoch - ) - - if validation_loss < best_loss: - best_loss = validation_loss - if self.save_checkpoints: - self._save_model_from_fit( - is_best=True, - folder=_get_checkpoint_folder( - self.work_dir, self.model_name - ), - epoch=epoch, - ) - - if verbose: - print( - "Training loss: {:.4f}, validation loss: {:.4f}, best val loss: {:.4f}".format( - training_loss, validation_loss, best_loss - ), - end="\r", - ) - elif verbose: - print(f"Training loss: {training_loss:.4f}", end="\r") - - def _compute_loss(self, output, target): - return self.criterion(output, target) - - def _produce_predict_output(self, input): - return self.model(input) - - def _evaluate_validation_loss(self, val_loader: DataLoader): - total_loss = 0 - self.model.eval() - with torch.no_grad(): - for batch_idx, val_batch in enumerate(val_loader): - val_batch = self._batch_to_device(val_batch) - output = self._produce_train_output(val_batch[:-1]) - target = val_batch[-1] - loss = self._compute_loss(output, target) - total_loss += loss.item() - - validation_loss = total_loss / (batch_idx + 1) - return validation_loss + elif elem is None: + aggregated.append(None) + elif isinstance(elem, TimeSeries): + aggregated.append([sample[i] for sample in batch]) + return tuple(aggregated) def save_model(self, path: str) -> None: """Saves the model under a given path. The path should end with '.pth.tar' @@ -1149,48 +1247,6 @@ def save_model(self, path: str) -> None: with open(path, "wb") as f_out: torch.save(self, f_out) - def _save_model_from_fit(self, is_best: bool, folder: str, epoch: int) -> None: - """ - Saves the torch model during training at a given epoch to the model's checkpoint folder. - Only the latest five save files are kept at most plus an additional save file for the model's best performing - state (on validation set). - Older save files will be removed. - - Parameters - ---------- - is_best - whether the model we're currently saving is the best (on validation set). - folder - path to the model's checkpoints folder. The folder is usually in the working directory under - './.darts/checkpoints/{model_name}' - epoch - current epoch number - """ - - checklist = glob(os.path.join(folder, "checkpoint_*")) - checklist = sorted(checklist, key=lambda x: float(re.findall(r"(\d+)", x)[-1])) - file_name = f"checkpoint_{epoch}.pth.tar" - os.makedirs(folder, exist_ok=True) - file_path = os.path.join(folder, file_name) - - self.save_model(file_path) - - if len(checklist) >= 5: - # remove older files - for chkpt in checklist[:-4]: - os.remove(chkpt) - if is_best: - best_path = os.path.join(folder, f"model_best_{epoch}.pth.tar") - shutil.copyfile(file_path, best_path) - checklist = glob(os.path.join(folder, "model_best_*")) - checklist = sorted( - checklist, key=lambda x: float(re.findall(r"(\d+)", x)[-1]) - ) - if len(checklist) >= 2: - # remove older files - for chkpt in checklist[:-1]: - os.remove(chkpt) - @staticmethod def load_model(path: str) -> "TorchForecastingModel": """loads a model from a given file path. 
The file name should end with '.pth.tar' @@ -1211,35 +1267,20 @@ def load_model(path: str) -> "TorchForecastingModel": model = torch.load(fin) return model - def _prepare_tensorboard_writer(self): - runs_folder = _get_runs_folder(self.work_dir, self.model_name) - if self.log_tensorboard: - if self.total_epochs > 0: - tb_writer = SummaryWriter(runs_folder, purge_step=self.total_epochs) - else: - tb_writer = SummaryWriter(runs_folder) - # TODO: implement an abstract method _get_input_dims() which returns input dimensions for - # TODO: eahc model type. Then we can restore tensorboard graphs. - # dummy_input = torch.empty(self.batch_size, self.input_chunk_length, self.input_dim).to(self.device) - # tb_writer.add_graph(self.model, dummy_input) - else: - tb_writer = None - return tb_writer - @staticmethod def load_from_checkpoint( model_name: str, work_dir: str = None, file_name: str = None, best: bool = True ) -> "TorchForecastingModel": - """Load the model from the checkpoints saved automatically. - - The checkpoints are saved under ``{work_dir}/checkpoints/{model_name}/``. - This method is used for models that were created with `save_checkpoints=True`. - If you manually saved your model, consider using :func:`load_model()`. + """ + Load the model from automatically saved checkpoints under '{work_dir}/darts_logs/{model_name}/checkpoints/'. + This method is used for models that were created with ``save_checkpoints=True``. + If you manually saved your model, consider using :meth:`load_model() `. - If `file_name` is given, returns the model saved under ``{work_dir}/checkpoints/{model_name}/{file_name}`` + If ``file_name`` is given, returns the model saved under + '{work_dir}/darts_logs/{model_name}/checkpoints/{file_name}'. - If `file_name` is not given, will try to restore the best checkpoint (if `best` is `True`) or the most - recent checkpoint (if `best` is `False`) from ``{work_dir}/checkpoints/{model_name}``. + If ``file_name`` is not given, will try to restore the best checkpoint (if ``best`` is ``True``) or the most + recent checkpoint (if ``best`` is ``False`` from '{work_dir}/darts_logs/{model_name}/checkpoints/'. Parameters ---------- @@ -1250,110 +1291,87 @@ def load_from_checkpoint( file_name The name of the checkpoint file. If not specified, use the most recent one. best - If set, will retrieve the best model (according to validation loss) instead of the most recent one. - Ignored when `file_name` is given. + If set, will retrieve the best model (according to validation loss) instead of the most recent one. Only + is ignored when ``file_name`` is given. Returns ------- TorchForecastingModel - The (trained) model instance. + The corresponding trained :class:`TorchForecastingModel`. 
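A hedged usage sketch for the checkpoint-loading path documented here (``"my_nbeats_run"`` is a hypothetical ``model_name`` of a model created with ``save_checkpoints=True``; restoring the best checkpoint assumes a validation series was passed to ``fit()``):

.. highlight:: python
.. code-block:: python

    from darts.models import NBEATSModel

    # best checkpoint according to validation loss (requires a validation series during training)
    model_best = NBEATSModel.load_from_checkpoint("my_nbeats_run")

    # most recent checkpoint instead of the best one
    model_last = NBEATSModel.load_from_checkpoint("my_nbeats_run", best=False)

    # resume training from the restored state for additional epochs:
    # model_last.fit(series, epochs=model_last.epochs_trained + 10)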
""" if work_dir is None: work_dir = os.path.join(os.getcwd(), DEFAULT_DARTS_FOLDER) checkpoint_dir = _get_checkpoint_folder(work_dir, model_name) + model_dir = _get_runs_folder(work_dir, model_name) - # if file_name is none, find most recent file in savepath that is a checkpoint - if file_name is None: - path = os.path.join( - checkpoint_dir, "model_best_*" if best else "checkpoint_*" - ) - checklist = glob(path) - if len(checklist) == 0: - raise_log( - FileNotFoundError( - "There is no file matching prefix {} in {}".format( - "model_best_*" if best else "checkpoint_*", checkpoint_dir - ) - ), - logger, - ) - file_name = max( - checklist, key=os.path.getctime - ) # latest file TODO: check case where no files match - file_name = os.path.basename(file_name) - - file_path = os.path.join(checkpoint_dir, file_name) - logger.info(f"loading {file_name}") - return TorchForecastingModel.load_model(file_path) + # load base TorchForecastingModel saved at model creation + base_model_path = os.path.join(model_dir, INIT_MODEL_NAME) + raise_if_not( + os.path.exists(base_model_path), + f"Could not find base model save file `{INIT_MODEL_NAME}` in {model_dir}.", + logger, + ) - def _get_best_torch_device(self): - is_cuda = torch.cuda.is_available() - if is_cuda: - return torch.device("cuda:0") - else: - return torch.device("cpu") + model = TorchForecastingModel.load_model(base_model_path) - def _get_learning_rate(self): - for p in self.optimizer.param_groups: - return p["lr"] + # load pytorch lightning module from checkpoint + # if file_name is None, find most recent file in savepath that is a checkpoint + if file_name is None: + file_name = _get_checkpoint_fname(work_dir, model_name, best=best) + file_path = os.path.join(checkpoint_dir, file_name) + logger.info("loading {}".format(file_name)) -class TorchParametricProbabilisticForecastingModel(TorchForecastingModel, ABC): - def __init__(self, likelihood: Optional[Likelihood] = None, **kwargs): - """Pytorch Parametric Probabilistic Forecasting Model. + model.model = model.model.__class__.load_from_checkpoint(file_path) + model.load_ckpt_path = file_path + return model - This is a base class for pytroch parametric probabilistic models. "Parametric" - means that these models are based on some predefined parametric distribution, say Gaussian. - Make sure that subclasses contain the *likelihood* parameter in __init__ method - and it is passed to the superclass via calling super().__init__. If the likelihood is not - provided, the model is considered as deterministic. + @property + def model_created(self) -> bool: + return self.model is not None - All TorchParametricProbabilisticForecastingModel's must produce outputs of shape - (batch_size, n_timesteps, n_components, n_params). I.e., there's an extra dimension - to store the distribution's parameters. + @property + def epochs_trained(self) -> int: + return self.model.epochs_trained if self.model_created else 0 - Parameters - ---------- - likelihood - The likelihood model to be used for probabilistic forecasts. 
- """ - super().__init__(**kwargs) - self.likelihood = likelihood + @property + def likelihood(self) -> Likelihood: + return ( + self.model.likelihood + if self.model_created + else self.pl_module_params.get("likelihood", None) + ) - def _is_probabilistic(self): - return self.likelihood is not None + @property + def input_chunk_length(self) -> int: + return ( + self.model.input_chunk_length + if self.model_created + else self.pl_module_params["input_chunk_length"] + ) - def _compute_loss(self, output, target): - # output is of shape (batch_size, n_timesteps, n_components, n_params) - if self.likelihood: - return self.likelihood.compute_loss(output, target) - else: - # If there's no likelihood, nr_params=1 and we need to squeeze out the - # last dimension of model output, for properly computing the loss. - return super()._compute_loss(output.squeeze(dim=-1), target) + @property + def output_chunk_length(self) -> int: + return ( + self.model.output_chunk_length + if self.model_created + else self.pl_module_params["output_chunk_length"] + ) - @abstractmethod - def _produce_predict_output(self, x): - """ - This method has to be implemented by all children. - """ - pass + def _is_probabilistic(self) -> bool: + return ( + self.model._is_probabilistic() + if self.model_created + else self.likelihood is not None + ) def _raise_if_wrong_type(obj, exp_type, msg="expected type {}, got: {}"): raise_if_not(isinstance(obj, exp_type), msg.format(exp_type, type(obj))) -def _cat_with_optional(tsr1: torch.Tensor, tsr2: Optional[torch.Tensor]): - if tsr2 is None: - return tsr1 - else: - # dimensions are (batch, length, width), we concatenate along the widths. - return torch.cat([tsr1, tsr2], dim=2) - - """ Below we define the 5 torch model types: * PastCovariatesTorchModel @@ -1509,106 +1527,6 @@ def _verify_past_future_covariates(self, past_covariates, future_covariates): "support only past_covariates.", ) - def _produce_train_output(self, input_batch: Tuple): - past_target, past_covariate = input_batch - # Currently all our PastCovariates models require past target and covariates concatenated - inpt = ( - torch.cat([past_target, past_covariate], dim=2) - if past_covariate is not None - else past_target - ) - return self.model(inpt) - - def _get_batch_prediction( - self, n: int, input_batch: Tuple, roll_size: int - ) -> torch.Tensor: - """ - Feeds PastCovariatesTorchModel with input and output chunks of a PastCovariatesSequentialDataset to farecast - the next `n` target values per target variable. - - Parameters: - ---------- - n - prediction length - input_batch - (past_target, past_covariates, future_past_covariates) - roll_size - roll input arrays after every sequence by `roll_size`. Initially, `roll_size` is equivalent to - `self.output_chunk_length` - """ - dim_component = 2 - past_target, past_covariates, future_past_covariates = input_batch - - n_targets = past_target.shape[dim_component] - n_past_covs = ( - past_covariates.shape[dim_component] if past_covariates is not None else 0 - ) - - input_past = torch.cat( - [ds for ds in [past_target, past_covariates] if ds is not None], - dim=dim_component, - ) - - out = self._produce_predict_output(input_past)[ - :, self.first_prediction_index :, : - ] - - batch_prediction = [out[:, :roll_size, :]] - prediction_length = roll_size - - while prediction_length < n: - # we want the last prediction to end exactly at `n` into the future. 
- # this means we may have to truncate the previous prediction and step - # back the roll size for the last chunk - if prediction_length + self.output_chunk_length > n: - spillover_prediction_length = ( - prediction_length + self.output_chunk_length - n - ) - roll_size -= spillover_prediction_length - prediction_length -= spillover_prediction_length - batch_prediction[-1] = batch_prediction[-1][:, :roll_size, :] - - # ==========> PAST INPUT <========== - # roll over input series to contain latest target and covariate - input_past = torch.roll(input_past, -roll_size, 1) - - # update target input to include next `roll_size` predictions - if self.input_chunk_length >= roll_size: - input_past[:, -roll_size:, :n_targets] = out[:, :roll_size, :] - else: - input_past[:, :, :n_targets] = out[:, -self.input_chunk_length :, :] - - # set left and right boundaries for extracting future elements - if self.input_chunk_length >= roll_size: - left_past, right_past = prediction_length - roll_size, prediction_length - else: - left_past, right_past = ( - prediction_length - self.input_chunk_length, - prediction_length, - ) - - # update past covariates to include next `roll_size` future past covariates elements - if n_past_covs and self.input_chunk_length >= roll_size: - input_past[ - :, -roll_size:, n_targets : n_targets + n_past_covs - ] = future_past_covariates[:, left_past:right_past, :] - elif n_past_covs: - input_past[ - :, :, n_targets : n_targets + n_past_covs - ] = future_past_covariates[:, left_past:right_past, :] - - # take only last part of the output sequence where needed - out = self._produce_predict_output(input_past)[ - :, self.first_prediction_index :, : - ] - batch_prediction.append(out) - prediction_length += self.output_chunk_length - - # bring predictions into desired format and drop unnecessary values - batch_prediction = torch.cat(batch_prediction, dim=1) - batch_prediction = batch_prediction[:, :n, :] - return batch_prediction - @property def _model_encoder_settings(self) -> Tuple[int, int, bool, bool]: input_chunk_length = self.input_chunk_length @@ -1682,11 +1600,6 @@ def _verify_past_future_covariates(self, past_covariates, future_covariates): "support only future_covariates.", ) - def _get_batch_prediction( - self, n: int, input_batch: Tuple, roll_size: int - ) -> Tensor: - raise NotImplementedError("TBD: Darts doesn't contain such a model yet.") - @property def _model_encoder_settings(self) -> Tuple[int, int, bool, bool]: input_chunk_length = self.input_chunk_length @@ -1753,13 +1666,6 @@ def _verify_past_future_covariates(self, past_covariates, future_covariates): "support only future_covariates.", ) - def _get_batch_prediction( - self, n: int, input_batch: Tuple, roll_size: int - ) -> Tensor: - raise NotImplementedError( - "TBD: The only DualCovariatesModel is an RNN with a specific implementation." 
- ) - @property def _model_encoder_settings(self) -> Tuple[int, int, bool, bool]: input_chunk_length = self.input_chunk_length @@ -1822,11 +1728,6 @@ def _verify_past_future_covariates(self, past_covariates, future_covariates): # both covariates are supported; do nothing pass - def _get_batch_prediction( - self, n: int, input_batch: Tuple, roll_size: int - ) -> Tensor: - raise NotImplementedError("TBD: Darts doesn't contain such a model yet.") - @property def _model_encoder_settings(self) -> Tuple[int, int, bool, bool]: input_chunk_length = self.input_chunk_length @@ -1890,11 +1791,6 @@ def _verify_predict_sample(self, predict_sample: Tuple): # TODO: we have to check both past and future covariates raise NotImplementedError() - def _get_batch_prediction( - self, n: int, input_batch: Tuple, roll_size: int - ) -> Tensor: - raise NotImplementedError("TBD: Darts doesn't contain such a model yet.") - @property def _model_encoder_settings(self) -> Tuple[int, int, bool, bool]: input_chunk_length = self.input_chunk_length diff --git a/darts/models/forecasting/transformer_model.py b/darts/models/forecasting/transformer_model.py index 18e18b9f37..dc4624e513 100644 --- a/darts/models/forecasting/transformer_model.py +++ b/darts/models/forecasting/transformer_model.py @@ -4,19 +4,14 @@ """ import math -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import torch import torch.nn as nn -from numpy.random import RandomState from darts.logging import get_logger -from darts.models.forecasting.torch_forecasting_model import ( - PastCovariatesTorchModel, - TorchParametricProbabilisticForecastingModel, -) -from darts.utils.likelihood_models import Likelihood -from darts.utils.torch import random_method +from darts.models.forecasting.pl_forecasting_module import PLPastCovariatesModule +from darts.models.forecasting.torch_forecasting_model import PastCovariatesTorchModel logger = get_logger(__name__) @@ -66,11 +61,9 @@ def forward(self, x): return self.dropout(x) -class _TransformerModule(nn.Module): +class _TransformerModule(PLPastCovariatesModule): def __init__( self, - input_chunk_length: int, - output_chunk_length: int, input_size: int, output_size: int, nr_params: int, @@ -83,6 +76,7 @@ def __init__( activation: str, custom_encoder: Optional[nn.Module] = None, custom_decoder: Optional[nn.Module] = None, + **kwargs ): """PyTorch module implementing a Transformer to be used in `TransformerModel`. @@ -92,10 +86,6 @@ def __init__( ---------- input_size The dimensionality of the TimeSeries instances that will be fed to the the fit and predict functions. - input_chunk_length - Number of time steps to be input to the forecasting module. - output_chunk_length - Number of time steps to be output by the forecasting module. output_size The dimensionality of the output time series. nr_params @@ -118,6 +108,8 @@ def __init__( a custom transformer encoder provided by the user (default=None) custom_decoder a custom transformer decoder provided by the user (default=None) + **kwargs + all parameters required for :class:`darts.model.forecasting_models.PLForecastingModule` base class. Inputs ------ @@ -130,16 +122,19 @@ def __init__( Tensor containing the prediction at the last time step of the sequence. 
""" - super().__init__() + super().__init__(**kwargs) + + # required for all modules -> saves hparams for checkpoints + self.save_hyperparameters() self.input_size = input_size self.target_size = output_size self.nr_params = nr_params - self.target_length = output_chunk_length + self.target_length = self.output_chunk_length self.encoder = nn.Linear(input_size, d_model) self.positional_encoding = _PositionalEncoding( - d_model, dropout, input_chunk_length + d_model, dropout, self.input_chunk_length ) # Defining the Transformer module @@ -156,7 +151,7 @@ def __init__( ) self.decoder = nn.Linear( - d_model, output_chunk_length * self.target_size * self.nr_params + d_model, self.output_chunk_length * self.target_size * self.nr_params ) def _create_transformer_inputs(self, data): @@ -196,10 +191,7 @@ def forward(self, data): return predictions -class TransformerModel( - TorchParametricProbabilisticForecastingModel, PastCovariatesTorchModel -): - @random_method +class TransformerModel(PastCovariatesTorchModel): def __init__( self, input_chunk_length: int, @@ -213,8 +205,6 @@ def __init__( activation: str = "relu", custom_encoder: Optional[nn.Module] = None, custom_decoder: Optional[nn.Module] = None, - likelihood: Optional[Likelihood] = None, - random_state: Optional[Union[int, RandomState]] = None, **kwargs ): @@ -258,64 +248,43 @@ def __init__( a custom user-provided encoder module for the transformer (default=None) custom_decoder a custom user-provided decoder module for the transformer (default=None) - likelihood - Optionally, the likelihood model to be used for probabilistic forecasts. - If no likelihood model is provided, forecasts will be deterministic. - random_state - Controls the randomness of the weights initialization. Check this - `link `_ for more details. - - batch_size - Number of time series (input and output sequences) used in each training pass. - n_epochs - Number of epochs over which to train the model. - add_encoders - A large number of past and future covariates can be automatically generated with `add_encoders`. - This can be done by adding mutliple pre-defined index encoders and/or custom user-made functions that - will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to - transform the generated covariates. This happens all under one hood and only needs to be specified at - model creation. - Read :meth:`SequentialEncoder ` to find out more about - `add_encoders`. An example showing some of `add_encoders` features: - - .. highlight:: python - .. code-block:: python + **kwargs + Optional arguments to initialize the pytorch_lightning.Module, pytorch_lightning.Trainer, and + Darts' :class:`TorchForecastingModel`. - add_encoders={ - 'cyclic': {'future': ['month']}, - 'datetime_attribute': {'future': ['hour', 'dayofweek']}, - 'position': {'past': ['absolute'], 'future': ['relative']}, - 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, - 'transformer': Scaler() - } - .. + loss_fn + PyTorch loss function used for training. + This parameter will be ignored for probabilistic models if the ``likelihood`` parameter is specified. + Default: ``torch.nn.MSELoss()``. + likelihood + The likelihood model to be used for probabilistic forecasts. optimizer_cls - The PyTorch optimizer class to be used (default: `torch.optim.Adam`). + The PyTorch optimizer class to be used (default: ``torch.optim.Adam``). 
        optimizer_kwargs
            Optionally, some keyword arguments for the PyTorch optimizer (e.g., ``{'lr': 1e-3}``
-            for specifying a learning rate). Otherwise the default values of the selected `optimizer_cls`
+            for specifying a learning rate). Otherwise the default values of the selected ``optimizer_cls``
             will be used.
         lr_scheduler_cls
-            Optionally, the PyTorch learning rate scheduler class to be used. Specifying `None` corresponds
+            Optionally, the PyTorch learning rate scheduler class to be used. Specifying ``None`` corresponds
             to using a constant learning rate.
         lr_scheduler_kwargs
-            Optionally, some keyword arguments for the PyTorch optimizer.
-        loss_fn
-            PyTorch loss function used for training.
-            This parameter will be ignored for probabilistic models if the `likelihood` parameter is specified.
-            Default: ``torch.nn.MSELoss()``.
+            Optionally, some keyword arguments for the PyTorch learning rate scheduler.
+        batch_size
+            Number of time series (input and output sequences) used in each training pass.
+        n_epochs
+            Number of epochs over which to train the model.
         model_name
             Name of the model. Used for creating checkpoints and saving tensorboard data. If not specified,
-            defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part of
-            the name is formatted with the local date and time, while PID is the processed ID (preventing models spawned
-            at the same time by different processes to share the same model_name). E.g.,
+            defaults to the following string ``"YYYY-mm-dd_HH:MM:SS_torch_model_run_PID"``, where the initial part
+            of the name is formatted with the local date and time, while PID is the process ID (preventing models
+            spawned at the same time by different processes from sharing the same model_name). E.g.,
             ``"2021-06-14_09:53:32_torch_model_run_44607"``.
         work_dir
             Path of the working directory, where to save checkpoints and Tensorboard summaries.
             (default: current working directory).
         log_tensorboard
             If set, use Tensorboard to log the different parameters. The logs will be located in:
-            `[work_dir]/.darts/runs/`.
+            ``"{work_dir}/darts_logs/{model_name}/logs/"``.
         nr_epochs_val_period
             Number of epochs to wait before evaluating the validation loss (if a validation
             ``TimeSeries`` is passed to the :func:`fit()` method).
@@ -323,12 +292,74 @@ def __init__(
             Optionally, a string indicating the torch device to use. (default: "cuda:0" if a GPU is available,
             otherwise "cpu")
         force_reset
-            If set to `True`, any previously-existing model with the same name will be reset (all checkpoints will
+            If set to ``True``, any previously-existing model with the same name will be reset (all checkpoints will
             be discarded).
         save_checkpoints
             Whether or not to automatically save the untrained model and checkpoints from training.
-            If set to `False`, the model can still be manually saved using :func:`save_model()`
-            and loaded using :func:`load_model()`.
+            To load the model from checkpoint, call :func:`MyModelClass.load_from_checkpoint()`, where
+            :class:`MyModelClass` is the :class:`TorchForecastingModel` class that was used (such as :class:`TFTModel`,
+            :class:`NBEATSModel`, etc.). If set to ``False``, the model can still be manually saved using
+            :func:`save_model()` and loaded using :func:`load_model()`.
+        add_encoders
+            A large number of past and future covariates can be automatically generated with `add_encoders`.
+            This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that
+            will be used as index encoders.
Additionally, a transformer such as Darts' :class:`Scaler` can be added to + transform the generated covariates. This happens all under one hood and only needs to be specified at + model creation. + Read :meth:`SequentialEncoder ` to find out more about + ``add_encoders``. An example showing some of ``add_encoders`` features: + + .. highlight:: python + .. code-block:: python + + add_encoders={ + 'cyclic': {'future': ['month']}, + 'datetime_attribute': {'future': ['hour', 'dayofweek']}, + 'position': {'past': ['absolute'], 'future': ['relative']}, + 'custom': {'past': [lambda idx: (idx.year - 1950) / 50]}, + 'transformer': Scaler() + } + .. + random_state + Control the randomness of the weights initialization. Check this + `link `_ for more details. + pl_trainer_kwargs + By default :class:`TorchForecastingModel` creates a PyTorch Lightning Trainer with several useful presets + that performs the training, validation and prediction processes. These presets include automatic + checkpointing, tensorboard logging, setting the torch device and more. + With ``pl_trainer_kwargs`` you can add additional kwargs to instantiate the PyTorch Lightning trainer + object. Check the `PL Trainer documentation + `_ for more information about the + supported kwargs. + With parameter ``"callbacks"`` you can add custom or PyTorch-Lightning built-in callbacks to Darts' + :class:`TorchForecastingModel`. Below is an example for adding EarlyStopping to the training process. + The model will stop training early if the validation loss `val_loss` does not improve beyond + specifications. For more information on callbacks, visit: + `PyTorch Lightning Callbacks + `_ + + .. highlight:: python + .. code-block:: python + + from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + # stop training when validation loss does not decrease more than 0.05 (`min_delta`) over + # a period of 5 epochs (`patience`) + my_stopper = EarlyStopping( + monitor="val_loss", + patience=5, + min_delta=0.05, + mode='min', + ) + + pl_trainer_kwargs={"callbacks": [my_stopper]} + .. + + Note that you can also use a custom PyTorch Lightning Trainer for training and prediction with optional + parameter ``trainer`` in :func:`fit()` and :func:`predict()`. + show_warnings + whether to show warnings raised from PyTorch Lightning. Useful to detect potential issues of + your forecasting use case. References ---------- @@ -347,13 +378,11 @@ def __init__( sequences of values, the input to the `tgt` argument would grow as outputs of the transformer model would be added to it. Of course, the training of the model would have to be adapted accordingly. 
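For reference, the custom-trainer path described in the docstring above can be exercised roughly as follows (a minimal sketch mirroring the new `test_ptl_trainer.py` tests; the series length and trainer arguments are illustrative):

```python
import numpy as np
import pytorch_lightning as pl

from darts.models import TransformerModel
from darts.utils.timeseries_generation import linear_timeseries

series = linear_timeseries(length=100).astype(np.float32)

# the usual TorchForecastingModel options (n_epochs, batch_size, ...) now arrive via **kwargs
model = TransformerModel(input_chunk_length=12, output_chunk_length=6, random_state=42)

# instead of the built-in trainer, a custom PyTorch Lightning Trainer can be passed to fit()
trainer = pl.Trainer(max_epochs=2, logger=False, enable_checkpointing=False, precision=32)
model.fit(series, trainer=trainer)

forecast = model.predict(n=6)
```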
""" + super().__init__(**self._extract_torch_model_params(**self.model_params)) - kwargs["input_chunk_length"] = input_chunk_length - kwargs["output_chunk_length"] = output_chunk_length - super().__init__(likelihood=likelihood, **kwargs) + # extract pytorch lightning module kwargs + self.pl_module_params = self._extract_pl_module_params(**self.model_params) - self.input_chunk_length = input_chunk_length - self.output_chunk_length = output_chunk_length self.d_model = d_model self.nhead = nhead self.num_encoder_layers = num_encoder_layers @@ -373,8 +402,6 @@ def _create_model(self, train_sample: Tuple[torch.Tensor]) -> torch.nn.Module: nr_params = 1 if self.likelihood is None else self.likelihood.num_parameters return _TransformerModule( - input_chunk_length=self.input_chunk_length, - output_chunk_length=self.output_chunk_length, input_size=input_dim, output_size=output_dim, nr_params=nr_params, @@ -387,12 +414,5 @@ def _create_model(self, train_sample: Tuple[torch.Tensor]) -> torch.nn.Module: activation=self.activation, custom_encoder=self.custom_encoder, custom_decoder=self.custom_decoder, + **self.pl_module_params, ) - - @random_method - def _produce_predict_output(self, x): - if self.likelihood: - output = self.model(x) - return self.likelihood.sample(output) - else: - return self.model(x).squeeze(dim=-1) diff --git a/darts/tests/models/forecasting/test_NBEATS.py b/darts/tests/models/forecasting/test_NBEATS.py index 5b3d871f79..25ee03af0a 100644 --- a/darts/tests/models/forecasting/test_NBEATS.py +++ b/darts/tests/models/forecasting/test_NBEATS.py @@ -1,3 +1,5 @@ +import shutil +import tempfile import numpy as np from darts.logging import get_logger @@ -18,6 +20,12 @@ if TORCH_AVAILABLE: class NBEATSModelTestCase(DartsBaseTestClass): + def setUp(self): + self.temp_work_dir = tempfile.mkdtemp(prefix="darts") + + def tearDown(self): + shutil.rmtree(self.temp_work_dir) + def test_creation(self): with self.assertRaises(ValueError): # if a list is passed to the `layer_widths` argument, it must have a length equal to `num_stacks` @@ -106,6 +114,7 @@ def test_logtensorboard(self): output_chunk_length=1, n_epochs=1, log_tensorboard=True, + work_dir=self.temp_work_dir, generic_architecture=architecture, ) model.fit(ts) diff --git a/darts/tests/models/forecasting/test_block_RNN.py b/darts/tests/models/forecasting/test_block_RNN.py index 86f7c72a27..7d2e123cdb 100644 --- a/darts/tests/models/forecasting/test_block_RNN.py +++ b/darts/tests/models/forecasting/test_block_RNN.py @@ -1,3 +1,5 @@ +import shutil +import tempfile import pandas as pd from darts import TimeSeries @@ -25,6 +27,7 @@ class BlockRNNModelTestCase(DartsBaseTestClass): module = _BlockRNNModule( "RNN", input_size=1, + input_chunk_length=1, output_chunk_length=1, hidden_dim=25, target_size=1, @@ -34,6 +37,12 @@ class BlockRNNModelTestCase(DartsBaseTestClass): dropout=0, ) + def setUp(self): + self.temp_work_dir = tempfile.mkdtemp(prefix="darts") + + def tearDown(self): + shutil.rmtree(self.temp_work_dir) + def test_creation(self): with self.assertRaises(ValueError): # cannot choose any string @@ -61,13 +70,17 @@ def test_fit(self): input_chunk_length=1, output_chunk_length=1, model="LSTM", - n_epochs=3, + n_epochs=1, model_name="unittest-model-lstm", + work_dir=self.temp_work_dir, save_checkpoints=True, + force_reset=True, ) model2.fit(self.series) model_loaded = model2.load_from_checkpoint( - model_name="unittest-model-lstm", best=False + model_name="unittest-model-lstm", + work_dir=self.temp_work_dir, + best=False, ) pred1 = 
model2.predict(n=6) pred2 = model_loaded.predict(n=6) diff --git a/darts/tests/models/forecasting/test_ensemble_models.py b/darts/tests/models/forecasting/test_ensemble_models.py index e5ceb267e9..96a17854fc 100644 --- a/darts/tests/models/forecasting/test_ensemble_models.py +++ b/darts/tests/models/forecasting/test_ensemble_models.py @@ -84,13 +84,13 @@ def test_predict_ensemble_local_models(self): @unittest.skipUnless(TORCH_AVAILABLE, "requires torch") def test_input_models_global_models(self): - NaiveEnsembleModel([RNNModel(), TCNModel(10, 2), NBEATSModel(10, 2)]) + NaiveEnsembleModel([RNNModel(12), TCNModel(10, 2), NBEATSModel(10, 2)]) @unittest.skipUnless(TORCH_AVAILABLE, "requires torch") def test_call_predict_global_models_univariate_input_no_covariates(self): naive_ensemble = NaiveEnsembleModel( [ - RNNModel(n_epochs=1), + RNNModel(12, n_epochs=1), TCNModel(10, 2, n_epochs=1), NBEATSModel(10, 2, n_epochs=1), ] @@ -105,7 +105,7 @@ def test_call_predict_global_models_univariate_input_no_covariates(self): def test_call_predict_global_models_multivariate_input_no_covariates(self): naive_ensemble = NaiveEnsembleModel( [ - RNNModel(n_epochs=1), + RNNModel(12, n_epochs=1), TCNModel(10, 2, n_epochs=1), NBEATSModel(10, 2, n_epochs=1), ] @@ -117,7 +117,7 @@ def test_call_predict_global_models_multivariate_input_no_covariates(self): def test_call_predict_global_models_multivariate_input_with_covariates(self): naive_ensemble = NaiveEnsembleModel( [ - RNNModel(n_epochs=1), + RNNModel(12, n_epochs=1), TCNModel(10, 2, n_epochs=1), NBEATSModel(10, 2, n_epochs=1), ] @@ -132,7 +132,7 @@ def test_call_predict_global_models_multivariate_input_with_covariates(self): @unittest.skipUnless(TORCH_AVAILABLE, "requires torch") def test_input_models_mixed(self): with self.assertRaises(ValueError): - NaiveEnsembleModel([NaiveDrift(), Theta(), RNNModel()]) + NaiveEnsembleModel([NaiveDrift(), Theta(), RNNModel(12)]) def test_fit_multivar_ts_with_local_models(self): naive = NaiveEnsembleModel( diff --git a/darts/tests/models/forecasting/test_global_forecasting_models.py b/darts/tests/models/forecasting/test_global_forecasting_models.py index f453c60cdd..23e8da6785 100644 --- a/darts/tests/models/forecasting/test_global_forecasting_models.py +++ b/darts/tests/models/forecasting/test_global_forecasting_models.py @@ -1,3 +1,4 @@ +from copy import deepcopy from unittest.mock import ANY, patch import numpy as np @@ -212,7 +213,6 @@ def test_multi_ts(self): def test_covariates(self): for model_cls, kwargs, err in models_cls_kwargs_errs: - model = model_cls( input_chunk_length=IN_LEN, output_chunk_length=OUT_LEN, @@ -229,7 +229,6 @@ def test_covariates(self): cov_kwargs = { cov_name: [self.time_covariates_train, self.time_covariates_train] } - model.fit( series=[self.ts_pass_train, self.ts_pass_train_1], **cov_kwargs ) @@ -312,7 +311,7 @@ def test_future_covariates(self): model.predict(n=166, series=self.ts_pass_train) # recurrent models can only predict data points for time steps where future covariates are available - model = RNNModel(n_epochs=1) + model = RNNModel(12, n_epochs=1) model.fit(series=self.target_past, future_covariates=self.covariates_past) model.predict(n=160, future_covariates=self.covariates) with self.assertRaises(ValueError): @@ -356,7 +355,6 @@ def _batch_prediction_test_helper_function(self, targets): past_covariates=[self.covariates] * len(targets), batch_size=batch_size, ) - for i in range(len(targets)): self.assertLess( sum(sum((preds[i] - preds_default[i]).values())), epsilon @@ -381,7 +379,6 
@@ def test_prediction_with_different_n(self): model = model_cls( input_chunk_length=IN_LEN, output_chunk_length=OUT_LEN, **kwargs ) - self.assertTrue( isinstance( model, @@ -420,29 +417,35 @@ def test_same_result_with_different_n_jobs(self): model = model_cls( input_chunk_length=IN_LEN, output_chunk_length=OUT_LEN, **kwargs ) - if model._is_probabilistic(): - continue + multiple_ts = [self.ts_pass_train] * 10 model.fit(multiple_ts) + # safe random state for two successive identical predictions + if model._is_probabilistic(): + random_state = deepcopy(model._random_instance) + else: + random_state = None + pred1 = model.predict(n=36, series=multiple_ts, n_jobs=1) + + if random_state is not None: + model._random_instance = random_state + pred2 = model.predict( n=36, series=multiple_ts, n_jobs=-1 ) # assuming > 1 core available in the machine - self.assertEqual( pred1, pred2, "Model {} produces different predictions with different number of jobs", ) - @patch("darts.models.forecasting.torch_forecasting_model.torch.save") @patch( - "darts.models.forecasting.torch_forecasting_model.TorchForecastingModel._train" + "darts.models.forecasting.torch_forecasting_model.TorchForecastingModel._init_trainer" ) - @patch("darts.models.forecasting.torch_forecasting_model.shutil.rmtree") - def test_fit_with_constr_epochs(self, rmtree_patch, train_patch, save_patch): + def test_fit_with_constr_epochs(self, init_trainer): for model_cls, kwargs, err in models_cls_kwargs_errs: model = model_cls( input_chunk_length=IN_LEN, output_chunk_length=OUT_LEN, **kwargs @@ -450,14 +453,14 @@ def test_fit_with_constr_epochs(self, rmtree_patch, train_patch, save_patch): multiple_ts = [self.ts_pass_train] * 10 model.fit(multiple_ts) - train_patch.assert_called_with(ANY, ANY, ANY, ANY, kwargs["n_epochs"]) + init_trainer.assert_called_with( + max_epochs=kwargs["n_epochs"], trainer_params=ANY + ) - @patch("darts.models.forecasting.torch_forecasting_model.torch.save") @patch( - "darts.models.forecasting.torch_forecasting_model.TorchForecastingModel._train" + "darts.models.forecasting.torch_forecasting_model.TorchForecastingModel._init_trainer" ) - @patch("darts.models.forecasting.torch_forecasting_model.shutil.rmtree") - def test_fit_with_fit_epochs(self, rmtree_patch, train_patch, save_patch): + def test_fit_with_fit_epochs(self, init_trainer): for model_cls, kwargs, err in models_cls_kwargs_errs: model = model_cls( input_chunk_length=IN_LEN, output_chunk_length=OUT_LEN, **kwargs @@ -466,23 +469,17 @@ def test_fit_with_fit_epochs(self, rmtree_patch, train_patch, save_patch): epochs = 42 model.fit(multiple_ts, epochs=epochs) - - train_patch.assert_called_with(ANY, ANY, ANY, ANY, epochs) + init_trainer.assert_called_with(max_epochs=epochs, trainer_params=ANY) model.total_epochs = epochs # continue training model.fit(multiple_ts, epochs=epochs) + init_trainer.assert_called_with(max_epochs=epochs, trainer_params=ANY) - train_patch.assert_called_with(ANY, ANY, ANY, ANY, epochs) - - @patch("darts.models.forecasting.torch_forecasting_model.torch.save") @patch( - "darts.models.forecasting.torch_forecasting_model.TorchForecastingModel._train" + "darts.models.forecasting.torch_forecasting_model.TorchForecastingModel._init_trainer" ) - @patch("darts.models.forecasting.torch_forecasting_model.shutil.rmtree") - def test_fit_from_dataset_with_epochs( - self, rmtree_patch, train_patch, save_patch - ): + def test_fit_from_dataset_with_epochs(self, init_trainer): for model_cls, kwargs, err in models_cls_kwargs_errs: model = model_cls( 
input_chunk_length=IN_LEN, output_chunk_length=OUT_LEN, **kwargs @@ -497,14 +494,11 @@ def test_fit_from_dataset_with_epochs( epochs = 42 model.fit_from_dataset(train_dataset, epochs=epochs) + init_trainer.assert_called_with(max_epochs=epochs, trainer_params=ANY) - train_patch.assert_called_with(ANY, ANY, ANY, ANY, epochs) - - model.total_epochs = epochs # continue training model.fit_from_dataset(train_dataset, epochs=epochs) - - train_patch.assert_called_with(ANY, ANY, ANY, ANY, epochs) + init_trainer.assert_called_with(max_epochs=epochs, trainer_params=ANY) def test_sample_smaller_than_batch_size(self): """ diff --git a/darts/tests/models/forecasting/test_local_forecasting_models.py b/darts/tests/models/forecasting/test_local_forecasting_models.py index eb2c32eecd..12c7f66bbd 100644 --- a/darts/tests/models/forecasting/test_local_forecasting_models.py +++ b/darts/tests/models/forecasting/test_local_forecasting_models.py @@ -118,7 +118,9 @@ class LocalForecastingModelsTestCase(DartsBaseTestClass): def test_save_model_parameters(self): # model creation parameters were saved before. check if re-created model has same params as original for model, _ in models: - self.assertTrue(model._model_params, model.untrained_model()._model_params) + self.assertTrue( + model._model_params == model.untrained_model()._model_params + ) def test_models_runnability(self): for model, _ in models: diff --git a/darts/tests/models/forecasting/test_ptl_trainer.py b/darts/tests/models/forecasting/test_ptl_trainer.py new file mode 100644 index 0000000000..752d2cb2d3 --- /dev/null +++ b/darts/tests/models/forecasting/test_ptl_trainer.py @@ -0,0 +1,222 @@ +import shutil +import tempfile + +import numpy as np +import pytorch_lightning as pl + +from darts.logging import get_logger +from darts.tests.base_test_class import DartsBaseTestClass +from darts.utils.timeseries_generation import linear_timeseries + +logger = get_logger(__name__) + +try: + from darts.models.forecasting.rnn_model import RNNModel + + TORCH_AVAILABLE = True +except ImportError: + logger.warning("Torch not available. 
RNN tests will be skipped.") + TORCH_AVAILABLE = False + + +if TORCH_AVAILABLE: + + class TestTorchForecastingModel(DartsBaseTestClass): + trainer_params = { + "max_epochs": 1, + "logger": False, + "enable_checkpointing": False, + } + + series = linear_timeseries(length=100).astype(np.float32) + + def setUp(self): + self.temp_work_dir = tempfile.mkdtemp(prefix="darts") + + def tearDown(self): + shutil.rmtree(self.temp_work_dir) + + def test_prediction_loaded_custom_trainer(self): + """validate manual save with automatic save files by comparing output between the two""" + auto_name = "test_save_automatic" + model = RNNModel( + 12, + "RNN", + 10, + 10, + model_name=auto_name, + work_dir=self.temp_work_dir, + save_checkpoints=True, + random_state=42, + ) + + # fit model with custom trainer + trainer = pl.Trainer( + max_epochs=1, + enable_checkpointing=True, + logger=False, + callbacks=model.trainer_params["callbacks"], + precision=32, + ) + model.fit(self.series, trainer=trainer) + + # load automatically saved model with manual load_model() and load_from_checkpoint() + model_loaded = RNNModel.load_from_checkpoint( + model_name=auto_name, work_dir=self.temp_work_dir, best=False + ) + + # compare prediction of loaded model with original model + self.assertEqual(model.predict(n=4), model_loaded.predict(n=4)) + + def test_prediction_custom_trainer(self): + model = RNNModel(12, "RNN", 10, 10, random_state=42) + model2 = RNNModel(12, "RNN", 10, 10, random_state=42) + + # fit model with custom trainer + trainer = pl.Trainer(**self.trainer_params, precision=32) + model.fit(self.series, trainer=trainer) + + # fit model with built-in trainer + model2.fit(self.series, epochs=1) + + # both should produce identical prediction + self.assertEqual(model.predict(n=4), model2.predict(n=4)) + + def test_custom_trainer_setup(self): + model = RNNModel(12, "RNN", 10, 10, random_state=42) + + # trainer with wrong precision should raise ValueError + trainer = pl.Trainer(**self.trainer_params, precision=64) + with self.assertRaises(ValueError): + model.fit(self.series, trainer=trainer) + + # no error with correct precision + trainer = pl.Trainer(**self.trainer_params, precision=32) + model.fit(self.series, trainer=trainer) + + # check if number of epochs trained is same as trainer.max_epochs + self.assertEqual(trainer.max_epochs, model.epochs_trained) + + def test_builtin_extended_trainer(self): + invalid_trainer_kwarg = {"precisionn": 32} + + # error will be raised at training time + with self.assertRaises(TypeError): + model = RNNModel( + 12, + "RNN", + 10, + 10, + random_state=42, + pl_trainer_kwargs=invalid_trainer_kwarg, + ) + model.fit(self.series, epochs=1) + + valid_trainer_kwargs = { + "precision": 32, + } + + # valid parameters shouldn't raise error + model = RNNModel( + 12, + "RNN", + 10, + 10, + random_state=42, + pl_trainer_kwargs=valid_trainer_kwargs, + ) + model.fit(self.series, epochs=1) + + def test_custom_callback(self): + class CounterCallback(pl.callbacks.Callback): + # counts the number of trained epochs starting from count_default + def __init__(self, count_default): + self.counter = count_default + + def on_train_epoch_end(self, *args, **kwargs): + self.counter += 1 + + my_counter_0 = CounterCallback(count_default=0) + my_counter_2 = CounterCallback(count_default=2) + + model = RNNModel( + 12, + "RNN", + 10, + 10, + random_state=42, + pl_trainer_kwargs={"callbacks": [my_counter_0, my_counter_2]}, + ) + + # check if callbacks were added + self.assertEqual(len(model.trainer_params["callbacks"]), 
2) + model.fit(self.series, epochs=2) + + self.assertEqual(my_counter_0.counter, model.epochs_trained) + self.assertEqual(my_counter_2.counter, model.epochs_trained + 2) + + # check that callbacks don't overwrite Darts' built-in checkpointer + model = RNNModel( + 12, + "RNN", + 10, + 10, + random_state=42, + work_dir=self.temp_work_dir, + save_checkpoints=True, + pl_trainer_kwargs={ + "callbacks": [CounterCallback(0), CounterCallback(2)] + }, + ) + # we expect 3 callbacks + self.assertEqual(len(model.trainer_params["callbacks"]), 3) + + # first one is our Checkpointer + self.assertTrue( + isinstance( + model.trainer_params["callbacks"][0], pl.callbacks.ModelCheckpoint + ) + ) + + # second and third are CounterCallbacks + for i in range(1, 3): + self.assertTrue( + isinstance(model.trainer_params["callbacks"][i], CounterCallback) + ) + + def test_early_stopping(self): + my_stopper = pl.callbacks.early_stopping.EarlyStopping( + monitor="val_loss", + stopping_threshold=1e9, + ) + model = RNNModel( + 12, + "RNN", + 10, + 10, + nr_epochs_val_period=1, + random_state=42, + pl_trainer_kwargs={"callbacks": [my_stopper]}, + ) + + # training should stop immediately with high stopping_threshold + model.fit(self.series, val_series=self.series, epochs=100, verbose=True) + self.assertEqual(model.epochs_trained, 1) + + # check that early stopping only takes valid monitor variables + my_stopper = pl.callbacks.early_stopping.EarlyStopping( + monitor="invalid_variable", + stopping_threshold=1e9, + ) + model = RNNModel( + 12, + "RNN", + 10, + 10, + nr_epochs_val_period=1, + random_state=42, + pl_trainer_kwargs={"callbacks": [my_stopper]}, + ) + + with self.assertRaises(RuntimeError): + model.fit(self.series, val_series=self.series, epochs=100, verbose=True) diff --git a/darts/tests/models/forecasting/test_torch_forecasting_model.py b/darts/tests/models/forecasting/test_torch_forecasting_model.py index 214ded15e3..fce67ac42e 100644 --- a/darts/tests/models/forecasting/test_torch_forecasting_model.py +++ b/darts/tests/models/forecasting/test_torch_forecasting_model.py @@ -31,7 +31,7 @@ def tearDown(self): def test_save_model_parameters(self): # check if re-created model has same params as original - model = RNNModel("RNN", 10, 10) + model = RNNModel(12, "RNN", 10, 10) self.assertTrue(model._model_params, model.untrained_model()._model_params) @patch( @@ -40,6 +40,7 @@ def test_save_model_parameters(self): def test_suppress_automatic_save(self, patch_save_model): model_name = "test_model" model1 = RNNModel( + 12, "RNN", 10, 10, @@ -48,6 +49,7 @@ def test_suppress_automatic_save(self, patch_save_model): save_checkpoints=False, ) model2 = RNNModel( + 12, "RNN", 10, 10, @@ -77,6 +79,7 @@ def test_manual_save_and_load(self): manual_name = "test_save_manual" auto_name = "test_save_automatic" model_manual_save = RNNModel( + 12, "RNN", 10, 10, @@ -86,6 +89,7 @@ def test_manual_save_and_load(self): random_state=42, ) model_auto_save = RNNModel( + 12, "RNN", 10, 10, @@ -102,15 +106,19 @@ def test_manual_save_and_load(self): model_manual_save.fit(series, epochs=1) model_auto_save.fit(series, epochs=1) - checkpoints_dir = os.path.join(self.temp_work_dir, "checkpoints") + model_dir = os.path.join(self.temp_work_dir) # check that file was not created with manual save - self.assertFalse(os.path.exists(os.path.join(checkpoints_dir, manual_name))) + self.assertFalse( + os.path.exists(os.path.join(model_dir, manual_name, "checkpoints")) + ) # check that file was created with automatic save - 
self.assertTrue(os.path.exists(os.path.join(checkpoints_dir, auto_name)))
+        self.assertTrue(
+            os.path.exists(os.path.join(model_dir, auto_name, "checkpoints"))
+        )

         # create manually saved model checkpoints folder
-        checkpoint_path_manual = os.path.join(checkpoints_dir, manual_name)
+        checkpoint_path_manual = os.path.join(model_dir, manual_name)
         os.mkdir(checkpoint_path_manual)

         # save manually saved model
@@ -128,35 +136,31 @@ def test_manual_save_and_load(self):
         )

         # load automatically saved model with manual load_model() and load_from_checkpoint()
-        model_path_automatic = os.path.join(
-            checkpoints_dir, auto_name, checkpoint_file_name
-        )
-        model_auto_save1 = RNNModel.load_model(model_path_automatic)
-
-        model_auto_save2 = RNNModel.load_from_checkpoint(
+        model_auto_save1 = RNNModel.load_from_checkpoint(
             model_name=auto_name, work_dir=self.temp_work_dir, best=False
         )

-        # compare manual load with manual save
+        # compare loaded checkpoint with manual save
         self.assertEqual(
             model_manual_save.predict(n=4), model_auto_save1.predict(n=4)
         )
-        self.assertEqual(
-            model_manual_save.predict(n=4), model_auto_save2.predict(n=4)
-        )

     def test_create_instance_new_model_no_name_set(self):
-        RNNModel("RNN", 10, 10, work_dir=self.temp_work_dir)
+        RNNModel(12, "RNN", 10, 10, work_dir=self.temp_work_dir)
         # no exception is raised
-        RNNModel("RNN", 10, 10, work_dir=self.temp_work_dir)
+        RNNModel(12, "RNN", 10, 10, work_dir=self.temp_work_dir)
         # no exception is raised

     def test_create_instance_existing_model_with_name_no_fit(self):
         model_name = "test_model"
-        RNNModel("RNN", 10, 10, work_dir=self.temp_work_dir, model_name=model_name)
+        RNNModel(
+            12, "RNN", 10, 10, work_dir=self.temp_work_dir, model_name=model_name
+        )
         # no exception is raised
-        RNNModel("RNN", 10, 10, work_dir=self.temp_work_dir, model_name=model_name)
+        RNNModel(
+            12, "RNN", 10, 10, work_dir=self.temp_work_dir, model_name=model_name
+        )
         # no exception is raised

     @patch(
@@ -166,11 +170,14 @@ def test_create_instance_existing_model_with_name_force(
         self, patch_reset_model
     ):
         model_name = "test_model"
-        RNNModel("RNN", 10, 10, work_dir=self.temp_work_dir, model_name=model_name)
+        RNNModel(
+            12, "RNN", 10, 10, work_dir=self.temp_work_dir, model_name=model_name
+        )
         # no exception is raised

         # since no fit, there is no data stored for the model, hence `force_reset` does nothing
         RNNModel(
+            12,
             "RNN",
             10,
             10,
@@ -188,6 +195,7 @@ def test_create_instance_existing_model_with_name_force_fit_with_reset(
     ):
         model_name = "test_model"
         model1 = RNNModel(
+            12,
             "RNN",
             10,
             10,
@@ -203,6 +211,7 @@ def test_create_instance_existing_model_with_name_force_fit_with_reset(
         model1.fit(series, epochs=1)

         RNNModel(
+            12,
             "RNN",
             10,
             10,
@@ -213,64 +222,66 @@ def test_create_instance_existing_model_with_name_force_fit_with_reset(
         )
         patch_reset_model.assert_called_once()

-    # n_epochs=20, fit|epochs=None, total_epochs=0 - train for 20 epochs
+    # TODO for PTL: currently we (have to (?)) create a new PTL trainer object every time fit() is called, which
+    # resets some of the model's attributes such as epoch and step counts. We have to check whether there is
+    # another way of doing this.
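The epoch-counting tests that follow encode the behavior flagged in the TODO above. In rough terms (a sketch assuming the default built-in trainer setup; the expected values mirror the assertions below):

```python
import numpy as np

from darts.models import RNNModel
from darts.utils.timeseries_generation import linear_timeseries

series = linear_timeseries(length=100).astype(np.float32)

model = RNNModel(12, "RNN", 10, 10, n_epochs=20)
model.fit(series)
print(model.epochs_trained)  # 20

# a second fit() builds a fresh PL trainer, so the counter reflects only the most recent run
model.fit(series)
print(model.epochs_trained)  # still 20, not 40
```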
+ + # n_epochs=20, fit|epochs=None, epochs_trained=0 - train for 20 epochs def test_train_from_0_n_epochs_20_no_fit_epochs(self): - model1 = RNNModel("RNN", 10, 10, n_epochs=20, work_dir=self.temp_work_dir) + model1 = RNNModel( + 12, "RNN", 10, 10, n_epochs=20, work_dir=self.temp_work_dir + ) times = pd.date_range("20130101", "20130410") pd_series = pd.Series(range(100), index=times) series = TimeSeries.from_series(pd_series) model1.fit(series) - self.assertEqual(model1.total_epochs, 20) + self.assertEqual(20, model1.epochs_trained) - # n_epochs = 20, fit|epochs=None, total_epochs=20 - train for another 20 epochs + # n_epochs = 20, fit|epochs=None, epochs_trained=20 - train for another 20 epochs def test_train_from_20_n_epochs_40_no_fit_epochs(self): - model1 = RNNModel("RNN", 10, 10, n_epochs=20, work_dir=self.temp_work_dir) + model1 = RNNModel( + 12, "RNN", 10, 10, n_epochs=20, work_dir=self.temp_work_dir + ) times = pd.date_range("20130101", "20130410") pd_series = pd.Series(range(100), index=times) series = TimeSeries.from_series(pd_series) model1.fit(series) - self.assertEqual(model1.total_epochs, 20) + self.assertEqual(20, model1.epochs_trained) model1.fit(series) - self.assertEqual(model1.total_epochs, 40) + self.assertEqual(20, model1.epochs_trained) - # n_epochs = 20, fit|epochs=None, total_epochs=10 - train for another 20 epochs + # n_epochs = 20, fit|epochs=None, epochs_trained=10 - train for another 20 epochs def test_train_from_10_n_epochs_20_no_fit_epochs(self): - model1 = RNNModel("RNN", 10, 10, n_epochs=20, work_dir=self.temp_work_dir) + model1 = RNNModel( + 12, "RNN", 10, 10, n_epochs=20, work_dir=self.temp_work_dir + ) times = pd.date_range("20130101", "20130410") pd_series = pd.Series(range(100), index=times) series = TimeSeries.from_series(pd_series) # simulate the case that user interrupted training with Ctrl-C after 10 epochs model1.fit(series, epochs=10) - self.assertEqual(model1.total_epochs, 10) + self.assertEqual(10, model1.epochs_trained) model1.fit(series) - self.assertEqual(model1.total_epochs, 30) - - # n_epochs = 20, fit|epochs=15, total_epochs=0 - train for 15 epochs - def test_train_from_0_n_epochs_20_fit_15_epochs(self): - model1 = RNNModel("RNN", 10, 10, n_epochs=20, work_dir=self.temp_work_dir) + self.assertEqual(20, model1.epochs_trained) - times = pd.date_range("20130101", "20130410") - pd_series = pd.Series(range(100), index=times) - series = TimeSeries.from_series(pd_series) - model1.fit(series, epochs=15) - self.assertEqual(model1.total_epochs, 15) - - # n_epochs = 20, fit|epochs=15, total_epochs=10 - train for 15 epochs + # n_epochs = 20, fit|epochs=15, epochs_trained=10 - train for 15 epochs def test_train_from_10_n_epochs_20_fit_15_epochs(self): - model1 = RNNModel("RNN", 10, 10, n_epochs=20, work_dir=self.temp_work_dir) + model1 = RNNModel( + 12, "RNN", 10, 10, n_epochs=20, work_dir=self.temp_work_dir + ) times = pd.date_range("20130101", "20130410") pd_series = pd.Series(range(100), index=times) series = TimeSeries.from_series(pd_series) # simulate the case that user interrupted training with Ctrl-C after 10 epochs model1.fit(series, epochs=10) - self.assertEqual(model1.total_epochs, 10) + self.assertEqual(10, model1.epochs_trained) model1.fit(series, epochs=15) - self.assertEqual(model1.total_epochs, 25) + self.assertEqual(15, model1.epochs_trained) diff --git a/darts/tests/models/forecasting/test_transformer_model.py b/darts/tests/models/forecasting/test_transformer_model.py index 0cca81d2c5..36879ab829 100644 --- 
a/darts/tests/models/forecasting/test_transformer_model.py +++ b/darts/tests/models/forecasting/test_transformer_model.py @@ -1,3 +1,5 @@ +import shutil +import tempfile import pandas as pd from darts import TimeSeries @@ -44,6 +46,12 @@ class TransformerModelTestCase(DartsBaseTestClass): custom_decoder=None, ) + def setUp(self): + self.temp_work_dir = tempfile.mkdtemp(prefix="darts") + + def tearDown(self): + shutil.rmtree(self.temp_work_dir) + def test_fit(self): # Test fit-save-load cycle model2 = TransformerModel( @@ -51,11 +59,15 @@ def test_fit(self): output_chunk_length=1, n_epochs=2, model_name="unittest-model-transformer", + work_dir=self.temp_work_dir, save_checkpoints=True, + force_reset=True, ) model2.fit(self.series) model_loaded = model2.load_from_checkpoint( - model_name="unittest-model-transformer", best=False + model_name="unittest-model-transformer", + work_dir=self.temp_work_dir, + best=False, ) pred1 = model2.predict(n=6) pred2 = model_loaded.predict(n=6) diff --git a/darts/utils/timeseries_generation.py b/darts/utils/timeseries_generation.py index 90a8b72cd2..7793254a58 100644 --- a/darts/utils/timeseries_generation.py +++ b/darts/utils/timeseries_generation.py @@ -686,3 +686,46 @@ def datetime_attribute_timeseries( values_df.index = time_index return TimeSeries.from_dataframe(values_df).astype(dtype) + + +def _build_forecast_series( + points_preds: Union[np.ndarray, Sequence[np.ndarray]], + input_series: TimeSeries, +) -> TimeSeries: + """ + Builds a forecast time series starting after the end of an input time series, with the + correct time index (or after the end of the input series, if specified). + """ + time_index_length = ( + len(points_preds) + if isinstance(points_preds, np.ndarray) + else len(points_preds[0]) + ) + time_index = _generate_new_dates(time_index_length, input_series=input_series) + if isinstance(points_preds, np.ndarray): + return TimeSeries.from_times_and_values( + time_index, + points_preds, + freq=input_series.freq_str, + columns=input_series.columns, + ) + + return TimeSeries.from_times_and_values( + time_index, + np.stack(points_preds, axis=2), + freq=input_series.freq_str, + columns=input_series.columns, + ) + + +def _generate_new_dates( + n: int, input_series: TimeSeries +) -> Union[pd.DatetimeIndex, pd.RangeIndex]: + """ + Generates `n` new dates after the end of the specified series + """ + last = input_series.end_time() + start = last + input_series.freq if input_series.has_datetime_index else last + 1 + return _generate_index( + start=start, freq=input_series.freq, length=n, name=input_series.time_dim + ) diff --git a/docs/userguide/covariates.md b/docs/userguide/covariates.md index 998df6fc55..14469b09c6 100644 --- a/docs/userguide/covariates.md +++ b/docs/userguide/covariates.md @@ -230,7 +230,7 @@ Depending on your forecast horizon `n`, the model can either predict in one go, - `past_covariates`: **at least** the same time span as `target` plus the next `n - output_chunk_length` time steps after the end of `target` - `future_covariates`: **at least** the same time span as `target` plus the next `n` time steps after the end of `target` -If you want to know more details about how covariates are used behind the scenes in Global Forecasting Models, read our [guide on Torch Forecasting Models](https://github.com/unit8co/darts/blob/master/doc/torch_forecasting_models.md) (PyTorch based GFMs). It gives a step-by-step explanation of the training and prediction process using one of our Torch Forecasting Models. 
+If you want to know more details about how covariates are used behind the scenes in Global Forecasting Models, read our [guide on Torch Forecasting Models](https://unit8co.github.io/darts/userguide/torch_forecasting_models.html) (PyTorch based GFMs). It gives a step-by-step explanation of the training and prediction process using one of our Torch Forecasting Models.

 ## 2.4. Examples
 We have lots of great examples showcasing how to use covariates with Darts' forecasting models.
diff --git a/docs/userguide/torch_forecasting_models.md b/docs/userguide/torch_forecasting_models.md
index c10dbae28b..d88c49ef69 100644
--- a/docs/userguide/torch_forecasting_models.md
+++ b/docs/userguide/torch_forecasting_models.md
@@ -1,7 +1,7 @@
 # In-depth look at Torch Forecasting Models
 This document was written for darts version 0.15.0.
-We assume that you already know about covariates in Darts. If you're new to the topic we recommend you to read our [guide on covariates](https://github.com/unit8co/darts/blob/master/doc/covariates.md) first.
+We assume that you already know about covariates in Darts. If you're new to the topic, we recommend you read our [guide on covariates](https://unit8co.github.io/darts/userguide/covariates.html) first.

 ## Content of this document

@@ -108,11 +108,11 @@ You can use the same covariates series for both `fit()` and `predict()` if they

 **Training** only works if at least one sample with an input and output chunk can be extracted from the data you passed to `fit()`. This applies both to training and validation data. In terms of minimum required time spans, this means:
 - `target` series of minimum length `input_chunk_length + output_chunk_length`
-- `*_covariates` time span requirements for `fit()` from [covariates guide section 2.3.](https://github.com/unit8co/darts/blob/master/doc/covariates.md#global-forecasting-models-gfms-1)
+- `*_covariates` time span requirements for `fit()` from [covariates guide section 2.3.](https://unit8co.github.io/darts/userguide/covariates.html#id6)

 For **prediction** you have to supply the `target` series that you wish to forecast. For any forecast horizon `n` the minimum time span requirements are:
 - `target` series of minimum length `input_chunk_length`
-- `*_covariates` time span requirements for `predict()` also from from [covariates guide section 2.3.](https://github.com/unit8co/darts/blob/master/doc/covariates.md#global-forecasting-models-gfms-1)
+- `*_covariates` time span requirements for `predict()` also from [covariates guide section 2.3.](https://unit8co.github.io/darts/userguide/covariates.html#id6)

 Side note: Our `*RNNModels` accept a `training_length` parameter at model creation instead of `output_chunk_length`. Internally the `output_chunk_length` for these models is automatically set to `1`. For training, past `target` must have a minimum length of `training_length + 1` and for prediction, a length of `input_chunk_length`.

diff --git a/requirements/torch.txt b/requirements/torch.txt
index a6207fb73c..621887fd74 100644
--- a/requirements/torch.txt
+++ b/requirements/torch.txt
@@ -1,2 +1,2 @@
-tensorboard>=2.4.0
+pytorch-lightning>=1.5.0
 torch>=1.8.0
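Taken together with the `pytorch-lightning` requirement above, the checkpoint workflow exercised by the new tests looks roughly like this (a sketch; the model name and working directory below are hypothetical):

```python
import numpy as np

from darts.models import RNNModel
from darts.utils.timeseries_generation import linear_timeseries

series = linear_timeseries(length=100).astype(np.float32)

model = RNNModel(
    12, "RNN", 10, 10,
    n_epochs=1,
    model_name="ptl_checkpoint_demo",  # hypothetical model name
    work_dir="./darts_example_runs",   # hypothetical working directory
    save_checkpoints=True,
    force_reset=True,
)
model.fit(series)

# reload the automatically saved checkpoint ("{work_dir}/{model_name}/checkpoints/") and compare predictions
loaded = RNNModel.load_from_checkpoint(
    model_name="ptl_checkpoint_demo", work_dir="./darts_example_runs", best=False
)
assert model.predict(n=6) == loaded.predict(n=6)
```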