diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..fa9471a4 --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +ignore = E226, E501, E741, E743, C901, W503, E203 +max-line-length = 127 +exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,examples,tmp diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 00000000..070f2557 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,35 @@ +name: Check coding style + +on: + push: + branches: + - main + - develop + pull_request: + branches: + - main + - develop + +jobs: + black: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Black Check + uses: psf/black@stable + with: + version: "22.3.0" + + flake8: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install flake8 + run: | + pip install flake8==4.0.1 + - name: run flake8 + run: | + flake8 . --count --show-source --statistics diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 00000000..e1ee36ad --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,29 @@ +name: Upload Python Package + +on: + release: + types: [created] + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + python setup.py sdist bdist_wheel + twine upload dist/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1f48c043..43e560dc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: Check Syntax and Run Tests +name: Run Tests on: push: @@ -15,8 +15,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.9] - torch-version: [1.8.0, 1.10.0] + python-version: [3.7, 3.9] + torch-version: [1.8.0, 1.11.0] steps: - uses: actions/checkout@v2 @@ -24,20 +24,15 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - name: Install flake8 - run: | - pip install flake8 - - name: Lint with flake8 - run: | - flake8 . --count --show-source --statistics - name: Install dependencies env: TORCH: "${{ matrix.torch-version }}" GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | python -m pip install --upgrade pip + pip install setuptools wheel pip install torch==${TORCH} -f https://download.pytorch.org/whl/cpu/torch_stable.html - pip install . + pip install --upgrade-strategy only-if-needed . - name: Install pytest run: | pip install pytest diff --git a/.github/workflows/tests_develop.yml b/.github/workflows/tests_develop.yml index 66a732b1..bae5795e 100644 --- a/.github/workflows/tests_develop.yml +++ b/.github/workflows/tests_develop.yml @@ -1,4 +1,4 @@ -name: Check Syntax and Run Tests +name: Run Tests on: push: @@ -16,7 +16,7 @@ jobs: strategy: matrix: python-version: [3.9] - torch-version: [1.10.0] + torch-version: [1.11.0] steps: - uses: actions/checkout@v2 @@ -24,20 +24,15 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - name: Install flake8 - run: | - pip install flake8 - - name: Lint with flake8 - run: | - flake8 . 
--count --show-source --statistics - name: Install dependencies env: TORCH: "${{ matrix.torch-version }}" GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | python -m pip install --upgrade pip + pip install setuptools wheel pip install torch==${TORCH} -f https://download.pytorch.org/whl/cpu/torch_stable.html - pip install . + pip install --upgrade-strategy only-if-needed . - name: Install pytest run: | pip install pytest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..3a5f9bb9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,14 @@ +exclude: '.git|.tox' +default_stages: [commit] +fail_fast: true + +repos: + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + + - repo: https://gitlab.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 diff --git a/CHANGELOG.md b/CHANGELOG.md index df941fe3..c98cb711 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Most recent change on the bottom. -## [Unreleased] +## [Unreleased] - 0.5.4 + +## [0.5.4] - 2022-04-12 +### Added +- `NequIPCalculator` now handles per-atom energies +- Added `initial_model_state_strict` YAML option +- `load_model_state` builder +- fusion strategy support +- `cumulative_wall` for early stopping +- Deploy model from YAML file directly + +### Changed +- Disallow PyTorch 1.9, which has some JIT bugs. +- `nequip-deploy build` now requires `--train-dir` option when specifying the training session +- Minimum Python version is now 3.7 + +### Fixed +- Better error in `Dataset.statistics` when field is missing +- `NequIPCalculator` now outputs energy as scalar rather than `(1, 1)` array +- `dataset: ase` now automatically adds `key_mapping` keys to `include_keys`, which is consistent with the npz dataset +- fixed reloading models with `per_species_rescale_scales/shifts` set to `null`/`None` +- graceful exit for `-n 0` in `nequip-benchmark` +- Strictly correct CSV headers for metrics (#198) ## [0.5.3] - 2022-02-23 ### Added diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5aa0f34a..ca9826e2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,6 +19,7 @@ We use the [`black`](https://black.readthedocs.io/en/stable/index.html) code for ``` Please run the formatter before you commit and certainly before you make a PR. The formatter can be easily set up to run automatically on file save in various editors. +You can also use ``pre-commit install`` to install a [pre-commit](https://pre-commit.com/) hook. ## CUDA support diff --git a/README.md b/README.md index b8feca2d..8bccc76a 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ NequIP is an open-source code for building E(3)-equivariant interatomic potentia NequIP requires: * Python >= 3.6 -* PyTorch >= 1.8, <=1.11.*. PyTorch can be installed following the [instructions from their documentation](https://pytorch.org/get-started/locally/). Note that neither `torchvision` nor `torchaudio`, included in the default install command, are needed for NequIP. +* PyTorch >= 1.8, !=1.9, <=1.11.*. PyTorch can be installed following the [instructions from their documentation](https://pytorch.org/get-started/locally/). Note that neither `torchvision` nor `torchaudio`, included in the default install command, are needed for NequIP.
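As an aside, the supported range documented in the README can be checked programmatically; the following is a minimal sketch, and the helper name and the use of `packaging` are illustrative assumptions rather than part of this patch:

```python
# Illustrative sketch only: check an installed PyTorch against the range the
# README documents (>= 1.8, != 1.9.*, <= 1.11.*). The helper name and the use
# of `packaging` are assumptions, not part of this patch.
from packaging import version

import torch


def torch_version_supported() -> bool:
    v = version.parse(torch.__version__.split("+")[0])  # drop local tags like "+cu113"
    if version.parse("1.9") <= v < version.parse("1.10"):
        return False  # 1.9.x is excluded because of JIT bugs
    return version.parse("1.8") <= v < version.parse("1.12")


if __name__ == "__main__":
    print("Supported PyTorch version:", torch_version_supported())
```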
To install: @@ -96,7 +96,7 @@ The `nequip-deploy` command is used to deploy the result of a training session i It compiles a NequIP model trained in Python to [TorchScript](https://pytorch.org/docs/stable/jit.html). The result is an optimized model file that has no dependency on the `nequip` Python library, or even on Python itself: ```bash -nequip-deploy build path/to/training/session/ where/to/put/deployed_model.pth +nequip-deploy build --train-dir path/to/training/session/ where/to/put/deployed_model.pth ``` For more details on this command, please run `nequip-deploy --help`. diff --git a/configs/full.yaml b/configs/full.yaml index 64451518..7c9e5507 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -189,7 +189,7 @@ early_stopping_lower_bounds: LR: 1.0e-5 early_stopping_upper_bounds: # stop early if a metric value is higher than the bound - wall: 1.0e+100 + cumulative_wall: 1.0e+100 # loss function loss_coeffs: # different weights to use in a weighted loss functions diff --git a/examples/lj/README.md b/examples/lj/README.md new file mode 100644 index 00000000..424cbfb9 --- /dev/null +++ b/examples/lj/README.md @@ -0,0 +1,10 @@ +Run commands with +``` +PYTHONPATH=`pwd`:$PYTHONPATH nequip-* ... +``` +so that the model from `lj.py` can be imported. + +For example, to create a deployed LJ model `lj.pth`: +```bash +PYTHONPATH=`pwd`:$PYTHONPATH nequip-deploy build --model lj.yaml lj.pth +``` \ No newline at end of file diff --git a/examples/lj/lj.py b/examples/lj/lj.py new file mode 100644 index 00000000..0bc02f63 --- /dev/null +++ b/examples/lj/lj.py @@ -0,0 +1,109 @@ +"""Example implementation of a Lennard-Jones potential in the NequIP framework. + +This serves as a basic example of how to write a NequIP framework model from scratch. +""" + +from typing import Union + +import torch + +from torch_runstats.scatter import scatter + +from nequip.data import AtomicDataDict +from nequip.nn import GraphModuleMixin, SequentialGraphNetwork, AtomwiseReduce + + +# First, we define a model module to do the actual computation: +class LennardJonesModule(GraphModuleMixin, torch.nn.Module): + """NequIP model module implementing a Lennard-Jones potential term. + + See, for example, `lj/cut` in LAMMPS: + https://docs.lammps.org/pair_lj.html + + Args: + initial_epsilon: initial value of the epsilon parameter. + initial_sigma: initial value of the sigma parameter. + trainable: whether epsilon and sigma should be trainable. + Default False. + """ + + def __init__( + self, + initial_epsilon: Union[float, torch.Tensor], + initial_sigma: Union[float, torch.Tensor], + trainable: bool = False, + irreps_in=None, + ) -> None: + super().__init__() + # We have to tell `GraphModuleMixin` what fields we expect in the input and output + # and what their irreps will be. Having basic geometry information (positions and edges) + # in the input is assumed. + # Per-atom energy is a scalar, so 0e. + self._init_irreps(irreps_out={AtomicDataDict.PER_ATOM_ENERGY_KEY: "0e"}) + self.trainable = trainable + eps = torch.as_tensor(initial_epsilon) + sigma = torch.as_tensor(initial_sigma) + assert eps.ndim == sigma.ndim == 0, "epsilon and sigma must be scalars" + if self.trainable: + self.epsilon = torch.nn.Parameter(eps) + self.sigma = torch.nn.Parameter(sigma) + else: + # buffers act like parameters, but are not trainable + self.register_buffer("epsilon", eps) + self.register_buffer("sigma", sigma) + + def forward(self, data: AtomicDataDict.Type) -> AtomicDataDict.Type: + """Run the module. 
+ + The module both takes and returns an `AtomicDataDict.Type` = `Dict[str, torch.Tensor]`. + Keys that the module does not modify/add are expected to be propagated to the output unchanged. + """ + # If they are not already present, compute and add the edge vectors and lengths to `data`: + data = AtomicDataDict.with_edge_vectors(data, with_lengths=True) + # compute the LJ energy: + lj_eng = (self.sigma / data[AtomicDataDict.EDGE_LENGTH_KEY]) ** 6.0 + lj_eng = torch.neg(lj_eng) + lj_eng = lj_eng + lj_eng.square() + # 2.0 because we do the slightly wastefull symmetric thing and let + # ij and ji each contribute half + # this avoids indexing out certain edges in the general case where + # the edges are not ordered. + lj_eng = (2.0 * self.epsilon) * lj_eng + # assign halves to centers + atomic_eng = scatter( + lj_eng, + # the edge indexes are of shape [2, n_edge]; + # edge_index[0] is the index of the central atom of each edge + data[AtomicDataDict.EDGE_INDEX_KEY][0], + dim=0, + # dim_size says that even if some atoms have no edges, we still + # want an output energy for them (it will be zero) + dim_size=len(data[AtomicDataDict.POSITIONS_KEY]), + ) + # NequIP defines standardized keys for typical fields: + data[AtomicDataDict.PER_ATOM_ENERGY_KEY] = atomic_eng + return data + + +# then, we define a *model builder* function that builds an LJ energy model +# from this and other modules: +def LennardJonesPotential(config) -> SequentialGraphNetwork: + # `from_parameters` builds a model containing each of these modules in sequence + # from a configuration `config` + return SequentialGraphNetwork.from_parameters( + shared_params=config, + layers={ + # LennardJonesModule will be built using options from `config` + "lj": LennardJonesModule, + # AtomwiseReduce will be built using the provided default parameters, + # and also those from `config`. + "total_energy_sum": ( + AtomwiseReduce, + dict( + reduce="sum", + field=AtomicDataDict.PER_ATOM_ENERGY_KEY, + out_field=AtomicDataDict.TOTAL_ENERGY_KEY, + ), + ), + }, + ) diff --git a/examples/lj/lj.yaml b/examples/lj/lj.yaml new file mode 100644 index 00000000..ecd07b84 --- /dev/null +++ b/examples/lj/lj.yaml @@ -0,0 +1,15 @@ +# model +model_builders: + - lj.LennardJonesPotential + # LennardJonesPotential gives an energy model + # ForceOutput takes an energy model and wraps it with an + # autodifferentiation call to get forces too: + - ForceOutput + +initial_epsilon: 1 +initial_sigma: 1 + +# system +r_max: 4.0 +chemical_symbols: + - H diff --git a/examples/monkeypatch.py b/examples/monkeypatch.py index 1234054f..052e3a94 100644 --- a/examples/monkeypatch.py +++ b/examples/monkeypatch.py @@ -5,18 +5,17 @@ convolution for later analysis. """ -import torch - from nequip.utils import Config, find_first_of_type from nequip.data import AtomicDataDict, AtomicData, dataset_from_config from nequip.nn import SequentialGraphNetwork, SaveForOutput +from nequip.train import Trainer # The path to the original training session path = "../results/aspirin/minimal" # Load the model -model = torch.load(path + "/best_model.pth") - +# there are other ways to do this, such as model_from_config etc. +model = Trainer.load_model_from_training_session(traindir=path) # Find the SequentialGraphNetwork, which contains the # sequential bulk of the NequIP GNN model. 
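Returning briefly to the `examples/lj/lj.py` module added above: the following standalone sketch (arbitrary values, not part of the patch) walks through the per-edge arithmetic in `LennardJonesModule.forward()` and shows that the 2·epsilon-per-directed-edge convention recovers the usual 4·epsilon Lennard-Jones pair energy.

```python
# Standalone sketch (values are arbitrary) of the per-edge arithmetic in
# LennardJonesModule.forward(): each directed edge ij and ji carries half of the
# pair energy, so a 2*epsilon prefactor per edge recovers the usual
# 4*epsilon*[(sigma/r)^12 - (sigma/r)^6] Lennard-Jones pair energy.
import torch

epsilon, sigma = 1.0, 1.0
r = torch.tensor([1.0, 1.5, 2.5])   # edge lengths for the i->j edges

lj = (sigma / r) ** 6.0
lj = torch.neg(lj)
lj = lj + lj.square()               # (sigma/r)^12 - (sigma/r)^6
edge_energy = (2.0 * epsilon) * lj  # half of each pair's energy, per directed edge

pair_energy = 4.0 * epsilon * ((sigma / r) ** 12 - (sigma / r) ** 6)
# ij and ji together contribute the full pair energy:
assert torch.allclose(2.0 * edge_energy, pair_energy)
```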
To see the diff --git a/nequip/_version.py b/nequip/_version.py index e99ddf2f..44e13b18 100644 --- a/nequip/_version.py +++ b/nequip/_version.py @@ -2,4 +2,4 @@ # See Python packaging guide # https://packaging.python.org/guides/single-sourcing-package-version/ -__version__ = "0.5.3" +__version__ = "0.5.4" diff --git a/nequip/ase/nequip_calculator.py b/nequip/ase/nequip_calculator.py index ef14e196..2f4d0aba 100644 --- a/nequip/ase/nequip_calculator.py +++ b/nequip/ase/nequip_calculator.py @@ -24,7 +24,7 @@ class NequIPCalculator(Calculator): """ - implemented_properties = ["energy", "forces"] + implemented_properties = ["energy", "energies", "forces"] def __init__( self, @@ -39,6 +39,9 @@ def __init__( Calculator.__init__(self, **kwargs) self.results = {} self.model = model + assert isinstance( + model, torch.nn.Module + ), "To build a NequIPCalculator from a deployed model, use NequIPCalculator.from_deployed_model" self.r_max = r_max self.device = device self.energy_units_to_eV = energy_units_to_eV @@ -113,7 +116,9 @@ def calculate(self, atoms=None, properties=["energy"], system_changes=all_change # predict + extract data out = self.model(data) forces = out[AtomicDataDict.FORCE_KEY].detach().cpu().numpy() - energy = out[AtomicDataDict.TOTAL_ENERGY_KEY].detach().cpu().numpy() + energy = ( + out[AtomicDataDict.TOTAL_ENERGY_KEY].detach().cpu().numpy().reshape(tuple()) + ) # store results self.results = { @@ -121,3 +126,12 @@ def calculate(self, atoms=None, properties=["energy"], system_changes=all_change # force has units eng / len: "forces": forces * (self.energy_units_to_eV / self.length_units_to_A), } + + if AtomicDataDict.PER_ATOM_ENERGY_KEY in out: + self.results["energies"] = self.energy_units_to_eV * ( + out[AtomicDataDict.PER_ATOM_ENERGY_KEY] + .detach() + .squeeze(-1) + .cpu() + .numpy() + ) diff --git a/nequip/ase/nosehoover.py b/nequip/ase/nosehoover.py index 04827870..5f9a2de8 100644 --- a/nequip/ase/nosehoover.py +++ b/nequip/ase/nosehoover.py @@ -100,7 +100,7 @@ def step(self): nvt_bath_halfstep = self.nvt_bath + 0.5 * self.dt * e_kin_diff / self.nvt_q e_kin_diff_halfstep = 0.5 * ( - np.sum(masses * np.sum(vel_halfstep ** 2, axis=1)) + np.sum(masses * np.sum(vel_halfstep**2, axis=1)) - (3 * self.natoms + 1) * units.kB * self.temp ) self.nvt_bath = ( diff --git a/nequip/data/AtomicData.py b/nequip/data/AtomicData.py index 4848b041..c7fce266 100644 --- a/nequip/data/AtomicData.py +++ b/nequip/data/AtomicData.py @@ -369,7 +369,10 @@ def from_ase( + list(kwargs.keys()) ) # the keys that are duplicated in kwargs are removed from the include_keys - include_keys = list(set(include_keys + ase_all_properties) - default_args) + include_keys = list( + set(include_keys + ase_all_properties + list(key_mapping.keys())) + - default_args + ) km = { "forces": AtomicDataDict.FORCE_KEY, diff --git a/nequip/data/dataset.py b/nequip/data/dataset.py index 9d62b20f..f95465bf 100644 --- a/nequip/data/dataset.py +++ b/nequip/data/dataset.py @@ -422,6 +422,10 @@ def statistics( assert arr_is_per in ("node", "graph", "edge") else: # Give a better error + if field not in ff_transformed and field not in data_transformed: + raise RuntimeError( + f"Field `{field}` for which statistics were requested not found in data." 
+ ) if field not in selectors: # this means field is not selected and so not available raise RuntimeError( diff --git a/nequip/model/__init__.py b/nequip/model/__init__.py index debbe3c8..d34e385a 100644 --- a/nequip/model/__init__.py +++ b/nequip/model/__init__.py @@ -1,7 +1,11 @@ from ._eng import EnergyModel, SimpleIrrepsConfig from ._grads import ForceOutput, PartialForceOutput from ._scaling import RescaleEnergyEtc, PerSpeciesRescale -from ._weight_init import uniform_initialize_FCs, initialize_from_state +from ._weight_init import ( + uniform_initialize_FCs, + initialize_from_state, + load_model_state, +) from ._build import model_from_config @@ -16,6 +20,7 @@ PerSpeciesRescale, uniform_initialize_FCs, initialize_from_state, + load_model_state, model_from_config, builder_utils, ] diff --git a/nequip/model/_build.py b/nequip/model/_build.py index 3fbc59da..0fe4e21d 100644 --- a/nequip/model/_build.py +++ b/nequip/model/_build.py @@ -2,8 +2,9 @@ from typing import Optional from nequip.data import AtomicDataset +from nequip.data.transforms import TypeMapper from nequip.nn import GraphModuleMixin -from nequip.utils import load_callable +from nequip.utils import load_callable, instantiate def model_from_config( @@ -26,17 +27,26 @@ def model_from_config( The build model. """ # Pre-process config - if initialize and dataset is not None: + type_mapper = None + if dataset is not None: + type_mapper = dataset.type_mapper + else: + try: + type_mapper, _ = instantiate(TypeMapper, all_args=config) + except RuntimeError: + pass + + if type_mapper is not None: if "num_types" in config: assert ( - config["num_types"] == dataset.type_mapper.num_types + config["num_types"] == type_mapper.num_types ), "inconsistant config & dataset" if "type_names" in config: assert ( - config["type_names"] == dataset.type_mapper.type_names + config["type_names"] == type_mapper.type_names ), "inconsistant config & dataset" - config["num_types"] = dataset.type_mapper.num_types - config["type_names"] = dataset.type_mapper.type_names + config["num_types"] = type_mapper.num_types + config["type_names"] = type_mapper.type_names # Build builders = [ diff --git a/nequip/model/_scaling.py b/nequip/model/_scaling.py index 03ce4f79..37f360ad 100644 --- a/nequip/model/_scaling.py +++ b/nequip/model/_scaling.py @@ -226,13 +226,12 @@ def PerSpeciesRescale( ) else: - # Put dummy values # the real ones will be loaded from the state dict later # note that the state dict includes buffers, # so this is fine regardless of whether its trainable. - scales = 1.0 - shifts = 0.0 + scales = 1.0 if scales is not None else None + shifts = 0.0 if shifts is not None else None # values correctly scaled according to where the come from # will be brought from the state dict later, # so what you set this to doesnt matter: diff --git a/nequip/model/_weight_init.py b/nequip/model/_weight_init.py index 60783be3..7d6184c4 100644 --- a/nequip/model/_weight_init.py +++ b/nequip/model/_weight_init.py @@ -11,16 +11,45 @@ # == Load old state == def initialize_from_state(config: Config, model: GraphModuleMixin, initialize: bool): - """Initialize the model from the state dict file given by the config options `initial_model_state`.""" + """Initialize the model from the state dict file given by the config options `initial_model_state`. + + Only loads the state dict if `initialize` is `True`; this is meant for, say, starting a training from a previous state. 
+ + If `initial_model_state_strict` controls + > whether to strictly enforce that the keys in state_dict + > match the keys returned by this module's state_dict() function + + See https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.load_state_dict. + """ if not initialize: return model # do nothing - key = "initial_model_state" - if key not in config: + return load_model_state( + config=config, model=model, initialize=initialize, _prefix="initial_model_state" + ) + + +def load_model_state( + config: Config, + model: GraphModuleMixin, + initialize: bool, + _prefix: str = "load_model_state", +): + """Load the model from the state dict file given by the config options `load_model_state`. + + Loads the state dict always; this is meant, for example, for building a new model to deploy with a given state dict. + + If `load_model_state_strict` controls + > whether to strictly enforce that the keys in state_dict + > match the keys returned by this module's state_dict() function + + See https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.load_state_dict. + """ + if _prefix not in config: raise KeyError( - f"initialize_from_state requires the `{key}` option specifying the state to initialize from" + f"initialize_from_state requires the `{_prefix}` option specifying the state to initialize from" ) - state = torch.load(config[key]) - model.load_state_dict(state) + state = torch.load(config[_prefix]) + model.load_state_dict(state, strict=config.get(_prefix + "_strict", True)) return model diff --git a/nequip/nn/_interaction_block.py b/nequip/nn/_interaction_block.py index 6f70af20..99b3acc6 100644 --- a/nequip/nn/_interaction_block.py +++ b/nequip/nn/_interaction_block.py @@ -173,7 +173,7 @@ def forward(self, data: AtomicDataDict.Type) -> AtomicDataDict.Type: # Necessary to get TorchScript to be able to type infer when its not None avg_num_neigh: Optional[float] = self.avg_num_neighbors if avg_num_neigh is not None: - x = x.div(avg_num_neigh ** 0.5) + x = x.div(avg_num_neigh**0.5) x = self.linear_2(x) diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index f0803ff0..327f9016 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -90,6 +90,10 @@ def main(args=None): datas = itertools.cycle(datas) + if args.n == 0: + print("Got -n 0, so quitting without running benchmark.") + return + # Load model: print("Loading model... ") model = model_from_config(config, initialize=True, dataset=dataset) diff --git a/nequip/scripts/deploy.py b/nequip/scripts/deploy.py index 736de9a4..48cc187e 100644 --- a/nequip/scripts/deploy.py +++ b/nequip/scripts/deploy.py @@ -9,6 +9,7 @@ import pathlib import logging import warnings +import yaml # This is a weird hack to avoid Intel MKL issues on the cluster when this is called as a subprocess of a process that has itself initialized PyTorch. # Since numpy gets imported later anyway for dataset stuff, this shouldn't affect performance. 
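For reference, here is a hypothetical config sketch showing how the new `load_model_state` builder and its `_strict` option slot into a builder chain; the builder and option names come from this patch, but the file name and omitted options are placeholders.

```python
# Hypothetical config sketch: the builder chain and option names come from this
# patch, but "my_state.pth" and the omitted options are placeholders.
config = dict(
    model_builders=[
        "SimpleIrrepsConfig",
        "EnergyModel",
        "ForceOutput",
        "load_model_state",            # new builder added in this patch
    ],
    load_model_state="my_state.pth",   # path to a torch.save()-ed state dict
    load_model_state_strict=True,      # mirrors torch.nn.Module.load_state_dict(strict=...)
    # ... r_max, chemical_symbols, and the usual network options go here ...
)
# This dict would then be passed (via nequip.utils.Config) to
# nequip.model.model_from_config to build a model and populate it from the state dict.
```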
@@ -20,7 +21,8 @@ from e3nn.util.jit import script -from nequip.scripts.train import _set_global_options +from nequip.model import model_from_config +from nequip.scripts.train import _set_global_options, default_config from nequip.train import Trainer from nequip.utils import Config from nequip.utils.versions import check_code_version, get_config_code_versions @@ -34,6 +36,7 @@ N_SPECIES_KEY: Final[str] = "n_species" TYPE_NAMES_KEY: Final[str] = "type_names" JIT_BAILOUT_KEY: Final[str] = "_jit_bailout_depth" +JIT_FUSION_STRATEGY: Final[str] = "_jit_fusion_strategy" TF32_KEY: Final[str] = "allow_tf32" _ALL_METADATA_KEYS = [ @@ -45,6 +48,7 @@ N_SPECIES_KEY, TYPE_NAMES_KEY, JIT_BAILOUT_KEY, + JIT_FUSION_STRATEGY, TF32_KEY, ] @@ -113,8 +117,23 @@ def load_deployed_model( torch.backends.cudnn.allow_tf32 = allow_tf32 # JIT bailout - if metadata[JIT_BAILOUT_KEY] != "": - jit_bailout: int = int(metadata[JIT_BAILOUT_KEY]) + if int(torch.__version__.split(".")[1]) >= 11: + strategy = metadata.get(JIT_FUSION_STRATEGY, "") + if strategy != "": + strategy = [e.split(",") for e in strategy.split(";")] + strategy = [(e[0], int(e[1])) for e in strategy] + else: + strategy = default_config[JIT_FUSION_STRATEGY] + old_strat = torch.jit.set_fusion_strategy(strategy) + if set_global_options == "warn" and old_strat != strategy: + warnings.warn( + f"Loaded model had a different value for _jit_fusion_strategy ({strategy}) than was currently set ({old_strat}); changing the GLOBAL setting!" + ) + else: + jit_bailout: int = metadata.get(JIT_BAILOUT_KEY, "") + if jit_bailout == "": + jit_bailout = default_config[JIT_BAILOUT_KEY] + jit_bailout = int(jit_bailout) # no way to get current value, so assume we are overwriting it if set_global_options == "warn": warnings.warn( @@ -145,8 +164,13 @@ def main(args=None): build_parser = subparsers.add_parser("build", help="Build a deployment model") build_parser.add_argument( - "train_dir", - help="Path to a working directory from a training session.", + "--model", + help="Path to a YAML file defining a model to deploy. 
Unless you know why you need to, do not use this option.", + type=pathlib.Path, + ) + build_parser.add_argument( + "--train-dir", + help="Path to a working directory from a training session to deploy.", type=pathlib.Path, ) build_parser.add_argument( @@ -170,23 +194,29 @@ def main(args=None): print(config) elif args.command == "build": - if not args.train_dir.is_dir(): - raise ValueError(f"{args.train_dir} is not a directory") - if args.out_file.is_dir(): - raise ValueError( - f"{args.out_dir} is a directory, but a path to a file for the deployed model must be given" - ) + if args.model and args.train_dir: + raise ValueError("--model and --train-dir cannot both be specified.") + if args.train_dir is not None: + logging.info("Loading best_model from training session...") + config = Config.from_file(str(args.train_dir / "config.yaml")) + elif args.model is not None: + logging.info("Building model from config...") + config = Config.from_file(str(args.model), defaults=default_config) + else: + raise ValueError("one of --train-dir or --model must be given") - # load config - config = Config.from_file(str(args.train_dir / "config.yaml")) _set_global_options(config) - check_code_version(config) # -- load model -- - model, _ = Trainer.load_model_from_training_session( - args.train_dir, model_name="best_model.pth", device="cpu" - ) + if args.train_dir is not None: + model, _ = Trainer.load_model_from_training_session( + args.train_dir, model_name="best_model.pth", device="cpu" + ) + elif args.model is not None: + model = model_from_config(config) + else: + raise AssertionError # -- compile -- model = _compile_for_deploy(model) @@ -217,9 +247,13 @@ def main(args=None): metadata[N_SPECIES_KEY] = str(n_species) metadata[TYPE_NAMES_KEY] = " ".join(type_names) - metadata[JIT_BAILOUT_KEY] = str(config["_jit_bailout_depth"]) + metadata[JIT_BAILOUT_KEY] = str(config[JIT_BAILOUT_KEY]) + if int(torch.__version__.split(".")[1]) >= 11 and JIT_FUSION_STRATEGY in config: + metadata[JIT_FUSION_STRATEGY] = ";".join( + "%s,%i" % e for e in config[JIT_FUSION_STRATEGY] + ) metadata[TF32_KEY] = str(int(config["allow_tf32"])) - metadata[CONFIG_KEY] = (args.train_dir / "config.yaml").read_text() + metadata[CONFIG_KEY] = yaml.dump(dict(config)) metadata = {k: v.encode("ascii") for k, v in metadata.items()} torch.jit.save(model, args.out_file, _extra_files=metadata) diff --git a/nequip/scripts/evaluate.py b/nequip/scripts/evaluate.py index 12c521c6..093bc663 100644 --- a/nequip/scripts/evaluate.py +++ b/nequip/scripts/evaluate.py @@ -24,7 +24,7 @@ def main(args=None, running_as_script: bool = True): - # in results dir, do: nequip-deploy build . deployed.pth + # in results dir, do: nequip-deploy build --train-dir . deployed.pth parser = argparse.ArgumentParser( description=textwrap.dedent( """Compute the error of a model on a test set using various metrics. diff --git a/nequip/scripts/train.py b/nequip/scripts/train.py index a4304e5b..67cbdcc8 100644 --- a/nequip/scripts/train.py +++ b/nequip/scripts/train.py @@ -44,6 +44,15 @@ grad_anomaly_mode=False, append=False, _jit_bailout_depth=2, # avoid 20 iters of pain, see https://github.com/pytorch/pytorch/issues/52286 + # Quote from eelison in PyTorch slack: + # https://pytorch.slack.com/archives/CDZD1FANA/p1644259272007529?thread_ts=1644064449.039479&cid=CDZD1FANA + # > Right now the default behavior is to specialize twice on static shapes and then on dynamic shapes. 
+ # > To reduce warmup time you can do something like setFusionStrartegy({{FusionBehavior::DYNAMIC, 3}}) + # > ... Although we would wouldn't really expect to recompile a dynamic shape fusion in a model, + # > provided broadcasting patterns remain fixed + # We default to DYNAMIC alone because the number of edges is always dynamic, + # even if the number of atoms is fixed: + _jit_fusion_strategy=[("DYNAMIC", 3)], ) @@ -118,9 +127,14 @@ def _set_global_options(config): torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False - # For avoiding 20 steps of painfully slow JIT recompilation - # See https://github.com/pytorch/pytorch/issues/52286 - torch._C._jit_set_bailout_depth(config["_jit_bailout_depth"]) + if int(torch.__version__.split(".")[1]) >= 11: + # PyTorch >= 1.11 + k = "_jit_fusion_strategy" + torch.jit.set_fusion_strategy(config.get(k)) + else: + # For avoiding 20 steps of painfully slow JIT recompilation + # See https://github.com/pytorch/pytorch/issues/52286 + torch._C._jit_set_bailout_depth(config["_jit_bailout_depth"]) if config.model_debug_mode: set_irreps_debug(enabled=True) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 6257fbca..3b8a1698 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -257,6 +257,7 @@ def __init__( **kwargs, ): self._initialized = False + self.cumulative_wall = 0 logging.debug("* Initialize Trainer") # store all init arguments @@ -408,14 +409,14 @@ def init_objects(self): ) n_args = 0 for key, item in kwargs.items(): - # prepand VALIDATION string if k is not with + # prepend VALIDATION string if k is not with if isinstance(item, dict): new_dict = {} for k, v in item.items(): if ( k.lower().startswith(VALIDATION) or k.lower().startswith(TRAIN) - or k.lower() in ["lr", "wall"] + or k.lower() in ["lr", "wall", "cumulative_wall"] ): new_dict[k] = item[k] else: @@ -435,7 +436,7 @@ def init_objects(self): for key in self.train_on_keys: if key not in self.model.irreps_out: raise RuntimeError( - "Loss function include fields that are not predicted by the model" + f"Loss function include fields {key} that are not predicted by the model {self.model.irreps_out}" ) @property @@ -499,6 +500,7 @@ def as_dict( dictionary["state_dict"]["cuda_rng_state"] = torch.cuda.get_rng_state( device=self.torch_device ) + dictionary["state_dict"]["cumulative_wall"] = self.cumulative_wall if training_progress: dictionary["progress"] = {} @@ -634,6 +636,7 @@ def from_dict(cls, dictionary, append: Optional[bool] = None): if item is not None: item.load_state_dict(state_dict[key]) trainer._initialized = True + trainer.cumulative_wall = state_dict["cumulative_wall"] torch.set_rng_state(state_dict["rng_state"]) trainer.dataset_rng.set_state(state_dict["dataset_rng_state"]) @@ -711,6 +714,7 @@ def init(self): self.init_objects() self._initialized = True + self.cumulative_wall = 0 def init_metrics(self): if self.metrics_components is None: @@ -750,6 +754,7 @@ def train(self): self.init_log() self.wall = perf_counter() + self.previous_cumulative_wall = self.cumulative_wall with atomic_write_group(): if self.iepoch == -1: @@ -1030,7 +1035,9 @@ def final_log(self): self.logger.info(f"! 
Stop training: {self.stop_arg}") wall = perf_counter() - self.wall + self.cumulative_wall = wall + self.previous_cumulative_wall self.logger.info(f"Wall time: {wall}") + self.logger.info(f"Cumulative wall time: {self.cumulative_wall}") def end_of_epoch_log(self): """ @@ -1039,10 +1046,12 @@ def end_of_epoch_log(self): lr = self.optim.param_groups[0]["lr"] wall = perf_counter() - self.wall + self.cumulative_wall = wall + self.previous_cumulative_wall self.mae_dict = dict( LR=lr, epoch=self.iepoch, wall=wall, + cumulative_wall=self.cumulative_wall, ) header = "epoch, wall, LR" @@ -1068,7 +1077,7 @@ def end_of_epoch_log(self): # append details from loss for key, value in self.loss_dict[category].items(): mat_str += f", {value:16.5g}" - header += f", {category}_{key}" + header += f",{category}_{key}" log_str[category] += f" {value:12.3g}" log_header[category] += f" {key:>12.12}" self.mae_dict[f"{category}_{key}"] = value @@ -1076,7 +1085,7 @@ def end_of_epoch_log(self): # append details from metrics for key, value in met.items(): mat_str += f", {value:12.3g}" - header += f", {category}_{key}" + header += f",{category}_{key}" if key not in skip_keys: log_str[category] += f" {value:12.3g}" log_header[category] += f" {key:>12.12}" diff --git a/nequip/utils/regressor.py b/nequip/utils/regressor.py index 930af115..3d23cf84 100644 --- a/nequip/utils/regressor.py +++ b/nequip/utils/regressor.py @@ -16,7 +16,7 @@ def solver(X, y, regressor: Optional[str] = "NormalizedGaussianProcess", **kwarg def normalized_gp(X, y, **kwargs): - feature_rms = 1.0 / np.sqrt(np.average(X ** 2, axis=0)) + feature_rms = 1.0 / np.sqrt(np.average(X**2, axis=0)) feature_rms = np.nan_to_num(feature_rms, 1) y_mean = torch.sum(y) / torch.sum(X) mean, std = base_gp( diff --git a/nequip/utils/savenload.py b/nequip/utils/savenload.py index c30f7496..53b09fcf 100644 --- a/nequip/utils/savenload.py +++ b/nequip/utils/savenload.py @@ -118,7 +118,6 @@ def finish_all_writes(): _MOVE_QUEUE.join() # ^ wait for all remaining moves to be processed - else: def _submit_move(from_name, to_name, blocking: bool): diff --git a/setup.py b/setup.py index ad57b290..e827a58b 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ description="NequIP is an open-source code for building E(3)-equivariant interatomic potentials.", download_url="https://github.com/mir-group/nequip", author="Simon Batzner, Albert Musealian, Lixin Sun, Mario Geiger, Anders Johansson, Tess Smidt", - python_requires=">=3.6", + python_requires=">=3.7", packages=find_packages(include=["nequip", "nequip.*"]), entry_points={ # make the scripts available as command line scripts @@ -29,7 +29,7 @@ "numpy", "ase", "tqdm", - "torch>=1.8,<=1.11", # torch.fx added in 1.8 + "torch>=1.8,<=1.12,!=1.9.0", # torch.fx added in 1.8 "e3nn>=0.3.5,<0.5.0", "pyyaml", "contextlib2;python_version<'3.7'", # backport of nullcontext diff --git a/tests/integration/test_deploy.py b/tests/integration/test_deploy.py index e2a6e500..cc710f11 100644 --- a/tests/integration/test_deploy.py +++ b/tests/integration/test_deploy.py @@ -27,14 +27,18 @@ def test_deploy(BENCHMARK_ROOT, device): # # TODO: is this true? 
# pytest.skip("CUDA and subprocesses have issues") - keys = [AtomicDataDict.TOTAL_ENERGY_KEY, AtomicDataDict.FORCE_KEY] + keys = [ + AtomicDataDict.TOTAL_ENERGY_KEY, + AtomicDataDict.FORCE_KEY, + AtomicDataDict.PER_ATOM_ENERGY_KEY, + ] config_path = pathlib.Path(__file__).parents[2] / "configs/minimal.yaml" true_config = yaml.load(config_path.read_text(), Loader=yaml.Loader) with tempfile.TemporaryDirectory() as tmpdir: # Save time run_name = "test_deploy" + dtype - root = tmpdir + "nequip_rootdir/" + root = tmpdir + "/nequip_rootdir/" true_config["run_name"] = run_name true_config["root"] = root true_config["dataset_file_name"] = str( @@ -54,7 +58,13 @@ def test_deploy(BENCHMARK_ROOT, device): # Deploy deployed_path = pathlib.Path(f"deployed_{dtype}.pth") retcode = subprocess.run( - ["nequip-deploy", "build", f"{root}/{run_name}/", str(deployed_path)], + [ + "nequip-deploy", + "build", + "--train-dir", + f"{root}/{run_name}/", + str(deployed_path), + ], cwd=tmpdir, ) retcode.check_returncode() @@ -73,7 +83,7 @@ def test_deploy(BENCHMARK_ROOT, device): dataset = dataset_from_config(Config.from_file(full_config_path)) data = AtomicData.to_AtomicDataDict(dataset[0].to(device)) for k in keys: - data.pop(k) + data.pop(k, None) train_pred = best_mod(data) train_pred = {k: train_pred[k].to("cpu") for k in keys} @@ -92,7 +102,7 @@ def test_deploy(BENCHMARK_ROOT, device): data_idx = 0 data = AtomicData.to_AtomicDataDict(dataset[data_idx].to("cpu")) for k in keys: - data.pop(k) + data.pop(k, None) deploy_pred = deploy_mod(data) deploy_pred = {k: deploy_pred[k].to("cpu") for k in keys} for k in keys: @@ -127,10 +137,14 @@ def test_deploy(BENCHMARK_ROOT, device): ase_pred = { AtomicDataDict.TOTAL_ENERGY_KEY: atoms.get_potential_energy(), AtomicDataDict.FORCE_KEY: atoms.get_forces(), + AtomicDataDict.PER_ATOM_ENERGY_KEY: atoms.get_potential_energies(), } + assert ase_pred[AtomicDataDict.TOTAL_ENERGY_KEY].shape == tuple() + assert ase_pred[AtomicDataDict.FORCE_KEY].shape == (len(atoms), 3) + assert ase_pred[AtomicDataDict.PER_ATOM_ENERGY_KEY].shape == (len(atoms),) for k in keys: assert torch.allclose( - deploy_pred[k], + deploy_pred[k].squeeze(-1), torch.as_tensor(ase_pred[k], dtype=torch.get_default_dtype()), atol=atol, ) diff --git a/tests/unit/data/test_dataset.py b/tests/unit/data/test_dataset.py index f0a04832..99616cb1 100644 --- a/tests/unit/data/test_dataset.py +++ b/tests/unit/data/test_dataset.py @@ -34,6 +34,7 @@ def ase_file(molecules): @pytest.fixture(scope="function") def npz(): + np.random.seed(0) natoms = NATOMS nframes = 8 yield dict( @@ -219,6 +220,7 @@ def test_per_graph_field(self, npz_dataset, fixed_field, subset, key, dim): if npz_dataset is None: return + torch.manual_seed(0) E = torch.rand((npz_dataset.len(),) + dim) ref_mean = torch.mean(E / NATOMS, dim=0) ref_std = torch.std(E / NATOMS, dim=0) @@ -296,9 +298,9 @@ def test_per_graph_field( del Ns if alpha == 1e-5: - ref_mean, ref_std, E = generate_E(N, 1000, 0.0) + ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.0) else: - ref_mean, ref_std, E = generate_E(N, 1000, 0.5) + ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.5) if subset: E_orig_order = torch.zeros_like( @@ -337,7 +339,7 @@ def test_per_graph_field( if alpha == 1e-5: assert torch.allclose(mean, ref_mean, rtol=1e-1) else: - assert torch.allclose(mean, ref_mean, rtol=2) + assert torch.allclose(mean, ref_mean, rtol=1) assert torch.allclose(std, torch.zeros_like(ref_mean), atol=alpha * 100) elif regressor == "NormalizedGaussianProcess": assert 
torch.std(mean).numpy() == 0 @@ -435,9 +437,9 @@ def test_from_atoms(self, molecules): ) -def generate_E(N, mean, std): +def generate_E(N, mean_min, mean_max, std): torch.manual_seed(0) - ref_mean = torch.rand((N.shape[1])) * mean + ref_mean = torch.rand((N.shape[1])) * (mean_max - mean_min) + mean_min t_mean = torch.ones((N.shape[0], 1)) * ref_mean.reshape([1, -1]) ref_std = torch.rand((N.shape[1])) * std t_std = torch.ones((N.shape[0], 1)) * ref_std.reshape([1, -1]) diff --git a/tests/unit/utils/test_gp.py b/tests/unit/utils/test_gp.py new file mode 100644 index 00000000..4792b9d2 --- /dev/null +++ b/tests/unit/utils/test_gp.py @@ -0,0 +1,37 @@ +import torch +import pytest + +from nequip.utils.regressor import base_gp +from sklearn.gaussian_process.kernels import DotProduct + + +# @pytest.mark.parametrize("full_rank", [True, False]) +@pytest.mark.parametrize("full_rank", [False]) +@pytest.mark.parametrize("alpha", [0, 1e-3, 0.1, 1]) +def test_random(full_rank, alpha): + + if alpha == 0 and not full_rank: + return + + torch.manual_seed(0) + n_samples = 10 + n_dim = 3 + + if full_rank: + X = torch.randint(low=1, high=10, size=(n_samples, n_dim)) + else: + X = torch.randint(low=1, high=10, size=(n_samples, 1)) * torch.ones( + (n_samples, n_dim) + ) + + ref_mean = torch.rand((n_dim, 1)) + y = torch.matmul(X, ref_mean) + + mean, std = base_gp( + X, y, DotProduct, {"sigma_0": 0, "sigma_0_bounds": "fixed"}, alpha=0.1 + ) + + if full_rank: + assert torch.allclose(ref_mean, mean, rtol=0.5) + else: + assert torch.allclose(mean, mean[0], rtol=1e-3)
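
Finally, a small standalone sketch of the string encoding that the `nequip/scripts/deploy.py` hunk above uses for the `_jit_fusion_strategy` metadata:

```python
# Standalone sketch of the string encoding used for the _jit_fusion_strategy
# metadata in the nequip/scripts/deploy.py hunk above: a list of (kind, depth)
# pairs is serialized as "kind,depth;kind,depth" and parsed back the same way.
strategy = [("DYNAMIC", 3)]
encoded = ";".join("%s,%i" % e for e in strategy)  # -> "DYNAMIC,3"
decoded = [(kind, int(depth)) for kind, depth in (e.split(",") for e in encoded.split(";"))]
assert decoded == strategy
```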