diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..fa9471a4 --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +ignore = E226, E501, E741, E743, C901, W503, E203 +max-line-length = 127 +exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,examples,tmp diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 00000000..070f2557 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,35 @@ +name: Check coding style + +on: + push: + branches: + - main + - develop + pull_request: + branches: + - main + - develop + +jobs: + black: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Black Check + uses: psf/black@stable + with: + version: "22.3.0" + + flake8: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install flake8 + run: | + pip install flake8==4.0.1 + - name: run flake8 + run: | + flake8 . --count --show-source --statistics diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 00000000..e1ee36ad --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,29 @@ +name: Upload Python Package + +on: + release: + types: [created] + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + python setup.py sdist bdist_wheel + twine upload dist/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1f48c043..43e560dc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: Check Syntax and Run Tests +name: Run Tests on: push: @@ -15,8 +15,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.9] - torch-version: [1.8.0, 1.10.0] + python-version: [3.7, 3.9] + torch-version: [1.8.0, 1.11.0] steps: - uses: actions/checkout@v2 @@ -24,20 +24,15 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - name: Install flake8 - run: | - pip install flake8 - - name: Lint with flake8 - run: | - flake8 . --count --show-source --statistics - name: Install dependencies env: TORCH: "${{ matrix.torch-version }}" GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | python -m pip install --upgrade pip + pip install setuptools wheel pip install torch==${TORCH} -f https://download.pytorch.org/whl/cpu/torch_stable.html - pip install . + pip install --upgrade-strategy only-if-needed . - name: Install pytest run: | pip install pytest diff --git a/.github/workflows/tests_develop.yml b/.github/workflows/tests_develop.yml index 66a732b1..bae5795e 100644 --- a/.github/workflows/tests_develop.yml +++ b/.github/workflows/tests_develop.yml @@ -1,4 +1,4 @@ -name: Check Syntax and Run Tests +name: Run Tests on: push: @@ -16,7 +16,7 @@ jobs: strategy: matrix: python-version: [3.9] - torch-version: [1.10.0] + torch-version: [1.11.0] steps: - uses: actions/checkout@v2 @@ -24,20 +24,15 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - name: Install flake8 - run: | - pip install flake8 - - name: Lint with flake8 - run: | - flake8 . 
--count --show-source --statistics - name: Install dependencies env: TORCH: "${{ matrix.torch-version }}" GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | python -m pip install --upgrade pip + pip install setuptools wheel pip install torch==${TORCH} -f https://download.pytorch.org/whl/cpu/torch_stable.html - pip install . + pip install --upgrade-strategy only-if-needed . - name: Install pytest run: | pip install pytest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..3a5f9bb9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,14 @@ +exclude: '.git|.tox' +default_stages: [commit] +fail_fast: true + +repos: + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + + - repo: https://gitlab.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 diff --git a/CHANGELOG.md b/CHANGELOG.md index df941fe3..c98cb711 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Most recent change on the bottom. -## [Unreleased] +## [Unreleased] - 0.5.4 + +## [0.5.4] - 2022-04-12 +### Added +- `NequIPCalculator` now handles per-atom energies +- Added `initial_model_state_strict` YAML option +- `load_model_state` builder +- fusion strategy support +- `cumulative_wall` for early stopping +- Deploy model from YAML file directly + +### Changed +- Disallow PyTorch 1.9, which has some JIT bugs. +- `nequip-deploy build` now requires `--train-dir` option when specifying the training session +- Minimum Python version is now 3.7 + +### Fixed +- Better error in `Dataset.statistics` when field is missing +- `NequIPCalculator` now outputs energy as scalar rather than `(1, 1)` array +- `dataset: ase` now automatically adds `key_mapping` keys to `include_keys`, which is consistent with the npz dataset +- fixed reloading models with `per_species_rescale_scales/shifts` set to `null`/`None` +- graceful exit for `-n 0` in `nequip-benchmark` +- Strictly correct CSV headers for metrics (#198) ## [0.5.3] - 2022-02-23 ### Added diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5aa0f34a..ca9826e2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,6 +19,7 @@ We use the [`black`](https://black.readthedocs.io/en/stable/index.html) code for ``` Please run the formatter before you commit and certainly before you make a PR. The formatter can be easily set up to run automatically on file save in various editors. +You can also use ``pre-commit install`` to install a [pre-commit](https://pre-commit.com/) hook. ## CUDA support diff --git a/README.md b/README.md index b8feca2d..8bccc76a 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ NequIP is an open-source code for building E(3)-equivariant interatomic potentia NequIP requires: * Python >= 3.6 -* PyTorch >= 1.8, <=1.11.*. PyTorch can be installed following the [instructions from their documentation](https://pytorch.org/get-started/locally/). Note that neither `torchvision` nor `torchaudio`, included in the default install command, are needed for NequIP. +* PyTorch >= 1.8, !=1.9, <=1.11.*. PyTorch can be installed following the [instructions from their documentation](https://pytorch.org/get-started/locally/). Note that neither `torchvision` nor `torchaudio`, included in the default install command, are needed for NequIP.
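As an aside, the supported range documented in the README can be checked programmatically; the following is a minimal sketch, and the helper name and the use of `packaging` are illustrative assumptions rather than part of this patch:

```python
# Illustrative sketch only: check an installed PyTorch against the range the
# README documents (>= 1.8, != 1.9.*, <= 1.11.*). The helper name and the use
# of `packaging` are assumptions, not part of this patch.
from packaging import version

import torch


def torch_version_supported() -> bool:
    v = version.parse(torch.__version__.split("+")[0])  # drop local tags like "+cu113"
    if version.parse("1.9") <= v < version.parse("1.10"):
        return False  # 1.9.x is excluded because of JIT bugs
    return version.parse("1.8") <= v < version.parse("1.12")


if __name__ == "__main__":
    print("Supported PyTorch version:", torch_version_supported())
```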
To install: @@ -96,7 +96,7 @@ The `nequip-deploy` command is used to deploy the result of a training session i It compiles a NequIP model trained in Python to [TorchScript](https://pytorch.org/docs/stable/jit.html). The result is an optimized model file that has no dependency on the `nequip` Python library, or even on Python itself: ```bash -nequip-deploy build path/to/training/session/ where/to/put/deployed_model.pth +nequip-deploy build --train-dir path/to/training/session/ where/to/put/deployed_model.pth ``` For more details on this command, please run `nequip-deploy --help`. diff --git a/configs/full.yaml b/configs/full.yaml index 64451518..7c9e5507 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -189,7 +189,7 @@ early_stopping_lower_bounds: LR: 1.0e-5 early_stopping_upper_bounds: # stop early if a metric value is higher than the bound - wall: 1.0e+100 + cumulative_wall: 1.0e+100 # loss function loss_coeffs: # different weights to use in a weighted loss functions diff --git a/examples/lj/README.md b/examples/lj/README.md new file mode 100644 index 00000000..424cbfb9 --- /dev/null +++ b/examples/lj/README.md @@ -0,0 +1,10 @@ +Run commands with +``` +PYTHONPATH=`pwd`:$PYTHONPATH nequip-* ... +``` +so that the model from `lj.py` can be imported. + +For example, to create a deployed LJ model `lj.pth`: +```bash +PYTHONPATH=`pwd`:$PYTHONPATH nequip-deploy build --model lj.yaml lj.pth +``` \ No newline at end of file diff --git a/examples/lj/lj.py b/examples/lj/lj.py new file mode 100644 index 00000000..0bc02f63 --- /dev/null +++ b/examples/lj/lj.py @@ -0,0 +1,109 @@ +"""Example implementation of a Lennard-Jones potential in the NequIP framework. + +This serves as a basic example of how to write a NequIP framework model from scratch. +""" + +from typing import Union + +import torch + +from torch_runstats.scatter import scatter + +from nequip.data import AtomicDataDict +from nequip.nn import GraphModuleMixin, SequentialGraphNetwork, AtomwiseReduce + + +# First, we define a model module to do the actual computation: +class LennardJonesModule(GraphModuleMixin, torch.nn.Module): + """NequIP model module implementing a Lennard-Jones potential term. + + See, for example, `lj/cut` in LAMMPS: + https://docs.lammps.org/pair_lj.html + + Args: + initial_epsilon: initial value of the epsilon parameter. + initial_sigma: initial value of the sigma parameter. + trainable: whether epsilon and sigma should be trainable. + Default False. + """ + + def __init__( + self, + initial_epsilon: Union[float, torch.Tensor], + initial_sigma: Union[float, torch.Tensor], + trainable: bool = False, + irreps_in=None, + ) -> None: + super().__init__() + # We have to tell `GraphModuleMixin` what fields we expect in the input and output + # and what their irreps will be. Having basic geometry information (positions and edges) + # in the input is assumed. + # Per-atom energy is a scalar, so 0e. + self._init_irreps(irreps_out={AtomicDataDict.PER_ATOM_ENERGY_KEY: "0e"}) + self.trainable = trainable + eps = torch.as_tensor(initial_epsilon) + sigma = torch.as_tensor(initial_sigma) + assert eps.ndim == sigma.ndim == 0, "epsilon and sigma must be scalars" + if self.trainable: + self.epsilon = torch.nn.Parameter(eps) + self.sigma = torch.nn.Parameter(sigma) + else: + # buffers act like parameters, but are not trainable + self.register_buffer("epsilon", eps) + self.register_buffer("sigma", sigma) + + def forward(self, data: AtomicDataDict.Type) -> AtomicDataDict.Type: + """Run the module. 
+ + The module both takes and returns an `AtomicDataDict.Type` = `Dict[str, torch.Tensor]`. + Keys that the module does not modify/add are expected to be propagated to the output unchanged. + """ + # If they are not already present, compute and add the edge vectors and lengths to `data`: + data = AtomicDataDict.with_edge_vectors(data, with_lengths=True) + # compute the LJ energy: + lj_eng = (self.sigma / data[AtomicDataDict.EDGE_LENGTH_KEY]) ** 6.0 + lj_eng = torch.neg(lj_eng) + lj_eng = lj_eng + lj_eng.square() + # 2.0 because we do the slightly wastefull symmetric thing and let + # ij and ji each contribute half + # this avoids indexing out certain edges in the general case where + # the edges are not ordered. + lj_eng = (2.0 * self.epsilon) * lj_eng + # assign halves to centers + atomic_eng = scatter( + lj_eng, + # the edge indexes are of shape [2, n_edge]; + # edge_index[0] is the index of the central atom of each edge + data[AtomicDataDict.EDGE_INDEX_KEY][0], + dim=0, + # dim_size says that even if some atoms have no edges, we still + # want an output energy for them (it will be zero) + dim_size=len(data[AtomicDataDict.POSITIONS_KEY]), + ) + # NequIP defines standardized keys for typical fields: + data[AtomicDataDict.PER_ATOM_ENERGY_KEY] = atomic_eng + return data + + +# then, we define a *model builder* function that builds an LJ energy model +# from this and other modules: +def LennardJonesPotential(config) -> SequentialGraphNetwork: + # `from_parameters` builds a model containing each of these modules in sequence + # from a configuration `config` + return SequentialGraphNetwork.from_parameters( + shared_params=config, + layers={ + # LennardJonesModule will be built using options from `config` + "lj": LennardJonesModule, + # AtomwiseReduce will be built using the provided default parameters, + # and also those from `config`. + "total_energy_sum": ( + AtomwiseReduce, + dict( + reduce="sum", + field=AtomicDataDict.PER_ATOM_ENERGY_KEY, + out_field=AtomicDataDict.TOTAL_ENERGY_KEY, + ), + ), + }, + ) diff --git a/examples/lj/lj.yaml b/examples/lj/lj.yaml new file mode 100644 index 00000000..ecd07b84 --- /dev/null +++ b/examples/lj/lj.yaml @@ -0,0 +1,15 @@ +# model +model_builders: + - lj.LennardJonesPotential + # LennardJonesPotential gives an energy model + # ForceOutput takes an energy model and wraps it with an + # autodifferentiation call to get forces too: + - ForceOutput + +initial_epsilon: 1 +initial_sigma: 1 + +# system +r_max: 4.0 +chemical_symbols: + - H diff --git a/examples/monkeypatch.py b/examples/monkeypatch.py index 1234054f..052e3a94 100644 --- a/examples/monkeypatch.py +++ b/examples/monkeypatch.py @@ -5,18 +5,17 @@ convolution for later analysis. """ -import torch - from nequip.utils import Config, find_first_of_type from nequip.data import AtomicDataDict, AtomicData, dataset_from_config from nequip.nn import SequentialGraphNetwork, SaveForOutput +from nequip.train import Trainer # The path to the original training session path = "../results/aspirin/minimal" # Load the model -model = torch.load(path + "/best_model.pth") - +# there are other ways to do this, such as model_from_config etc. +model = Trainer.load_model_from_training_session(traindir=path) # Find the SequentialGraphNetwork, which contains the # sequential bulk of the NequIP GNN model. 
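Returning briefly to the `examples/lj/lj.py` module added above: the following standalone sketch (arbitrary values, not part of the patch) walks through the per-edge arithmetic in `LennardJonesModule.forward()` and shows that the 2·epsilon-per-directed-edge convention recovers the usual 4·epsilon Lennard-Jones pair energy.

```python
# Standalone sketch (values are arbitrary) of the per-edge arithmetic in
# LennardJonesModule.forward(): each directed edge ij and ji carries half of the
# pair energy, so a 2*epsilon prefactor per edge recovers the usual
# 4*epsilon*[(sigma/r)^12 - (sigma/r)^6] Lennard-Jones pair energy.
import torch

epsilon, sigma = 1.0, 1.0
r = torch.tensor([1.0, 1.5, 2.5])   # edge lengths for the i->j edges

lj = (sigma / r) ** 6.0
lj = torch.neg(lj)
lj = lj + lj.square()               # (sigma/r)^12 - (sigma/r)^6
edge_energy = (2.0 * epsilon) * lj  # half of each pair's energy, per directed edge

pair_energy = 4.0 * epsilon * ((sigma / r) ** 12 - (sigma / r) ** 6)
# ij and ji together contribute the full pair energy:
assert torch.allclose(2.0 * edge_energy, pair_energy)
```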
To see the diff --git a/nequip/_version.py b/nequip/_version.py index e99ddf2f..44e13b18 100644 --- a/nequip/_version.py +++ b/nequip/_version.py @@ -2,4 +2,4 @@ # See Python packaging guide # https://packaging.python.org/guides/single-sourcing-package-version/ -__version__ = "0.5.3" +__version__ = "0.5.4" diff --git a/nequip/ase/nequip_calculator.py b/nequip/ase/nequip_calculator.py index ef14e196..2f4d0aba 100644 --- a/nequip/ase/nequip_calculator.py +++ b/nequip/ase/nequip_calculator.py @@ -24,7 +24,7 @@ class NequIPCalculator(Calculator): """ - implemented_properties = ["energy", "forces"] + implemented_properties = ["energy", "energies", "forces"] def __init__( self, @@ -39,6 +39,9 @@ def __init__( Calculator.__init__(self, **kwargs) self.results = {} self.model = model + assert isinstance( + model, torch.nn.Module + ), "To build a NequIPCalculator from a deployed model, use NequIPCalculator.from_deployed_model" self.r_max = r_max self.device = device self.energy_units_to_eV = energy_units_to_eV @@ -113,7 +116,9 @@ def calculate(self, atoms=None, properties=["energy"], system_changes=all_change # predict + extract data out = self.model(data) forces = out[AtomicDataDict.FORCE_KEY].detach().cpu().numpy() - energy = out[AtomicDataDict.TOTAL_ENERGY_KEY].detach().cpu().numpy() + energy = ( + out[AtomicDataDict.TOTAL_ENERGY_KEY].detach().cpu().numpy().reshape(tuple()) + ) # store results self.results = { @@ -121,3 +126,12 @@ def calculate(self, atoms=None, properties=["energy"], system_changes=all_change # force has units eng / len: "forces": forces * (self.energy_units_to_eV / self.length_units_to_A), } + + if AtomicDataDict.PER_ATOM_ENERGY_KEY in out: + self.results["energies"] = self.energy_units_to_eV * ( + out[AtomicDataDict.PER_ATOM_ENERGY_KEY] + .detach() + .squeeze(-1) + .cpu() + .numpy() + ) diff --git a/nequip/ase/nosehoover.py b/nequip/ase/nosehoover.py index 04827870..5f9a2de8 100644 --- a/nequip/ase/nosehoover.py +++ b/nequip/ase/nosehoover.py @@ -100,7 +100,7 @@ def step(self): nvt_bath_halfstep = self.nvt_bath + 0.5 * self.dt * e_kin_diff / self.nvt_q e_kin_diff_halfstep = 0.5 * ( - np.sum(masses * np.sum(vel_halfstep ** 2, axis=1)) + np.sum(masses * np.sum(vel_halfstep**2, axis=1)) - (3 * self.natoms + 1) * units.kB * self.temp ) self.nvt_bath = ( diff --git a/nequip/data/AtomicData.py b/nequip/data/AtomicData.py index 4848b041..c7fce266 100644 --- a/nequip/data/AtomicData.py +++ b/nequip/data/AtomicData.py @@ -369,7 +369,10 @@ def from_ase( + list(kwargs.keys()) ) # the keys that are duplicated in kwargs are removed from the include_keys - include_keys = list(set(include_keys + ase_all_properties) - default_args) + include_keys = list( + set(include_keys + ase_all_properties + list(key_mapping.keys())) + - default_args + ) km = { "forces": AtomicDataDict.FORCE_KEY, diff --git a/nequip/data/dataset.py b/nequip/data/dataset.py index 9d62b20f..f95465bf 100644 --- a/nequip/data/dataset.py +++ b/nequip/data/dataset.py @@ -422,6 +422,10 @@ def statistics( assert arr_is_per in ("node", "graph", "edge") else: # Give a better error + if field not in ff_transformed and field not in data_transformed: + raise RuntimeError( + f"Field `{field}` for which statistics were requested not found in data." 
+ ) if field not in selectors: # this means field is not selected and so not available raise RuntimeError( diff --git a/nequip/model/__init__.py b/nequip/model/__init__.py index debbe3c8..d34e385a 100644 --- a/nequip/model/__init__.py +++ b/nequip/model/__init__.py @@ -1,7 +1,11 @@ from ._eng import EnergyModel, SimpleIrrepsConfig from ._grads import ForceOutput, PartialForceOutput from ._scaling import RescaleEnergyEtc, PerSpeciesRescale -from ._weight_init import uniform_initialize_FCs, initialize_from_state +from ._weight_init import ( + uniform_initialize_FCs, + initialize_from_state, + load_model_state, +) from ._build import model_from_config @@ -16,6 +20,7 @@ PerSpeciesRescale, uniform_initialize_FCs, initialize_from_state, + load_model_state, model_from_config, builder_utils, ] diff --git a/nequip/model/_build.py b/nequip/model/_build.py index 3fbc59da..0fe4e21d 100644 --- a/nequip/model/_build.py +++ b/nequip/model/_build.py @@ -2,8 +2,9 @@ from typing import Optional from nequip.data import AtomicDataset +from nequip.data.transforms import TypeMapper from nequip.nn import GraphModuleMixin -from nequip.utils import load_callable +from nequip.utils import load_callable, instantiate def model_from_config( @@ -26,17 +27,26 @@ def model_from_config( The build model. """ # Pre-process config - if initialize and dataset is not None: + type_mapper = None + if dataset is not None: + type_mapper = dataset.type_mapper + else: + try: + type_mapper, _ = instantiate(TypeMapper, all_args=config) + except RuntimeError: + pass + + if type_mapper is not None: if "num_types" in config: assert ( - config["num_types"] == dataset.type_mapper.num_types + config["num_types"] == type_mapper.num_types ), "inconsistant config & dataset" if "type_names" in config: assert ( - config["type_names"] == dataset.type_mapper.type_names + config["type_names"] == type_mapper.type_names ), "inconsistant config & dataset" - config["num_types"] = dataset.type_mapper.num_types - config["type_names"] = dataset.type_mapper.type_names + config["num_types"] = type_mapper.num_types + config["type_names"] = type_mapper.type_names # Build builders = [ diff --git a/nequip/model/_scaling.py b/nequip/model/_scaling.py index 03ce4f79..37f360ad 100644 --- a/nequip/model/_scaling.py +++ b/nequip/model/_scaling.py @@ -226,13 +226,12 @@ def PerSpeciesRescale( ) else: - # Put dummy values # the real ones will be loaded from the state dict later # note that the state dict includes buffers, # so this is fine regardless of whether its trainable. - scales = 1.0 - shifts = 0.0 + scales = 1.0 if scales is not None else None + shifts = 0.0 if shifts is not None else None # values correctly scaled according to where the come from # will be brought from the state dict later, # so what you set this to doesnt matter: diff --git a/nequip/model/_weight_init.py b/nequip/model/_weight_init.py index 60783be3..7d6184c4 100644 --- a/nequip/model/_weight_init.py +++ b/nequip/model/_weight_init.py @@ -11,16 +11,45 @@ # == Load old state == def initialize_from_state(config: Config, model: GraphModuleMixin, initialize: bool): - """Initialize the model from the state dict file given by the config options `initial_model_state`.""" + """Initialize the model from the state dict file given by the config options `initial_model_state`. + + Only loads the state dict if `initialize` is `True`; this is meant for, say, starting a training from a previous state. 
+ + If `initial_model_state_strict` controls + > whether to strictly enforce that the keys in state_dict + > match the keys returned by this module's state_dict() function + + See https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.load_state_dict. + """ if not initialize: return model # do nothing - key = "initial_model_state" - if key not in config: + return load_model_state( + config=config, model=model, initialize=initialize, _prefix="initial_model_state" + ) + + +def load_model_state( + config: Config, + model: GraphModuleMixin, + initialize: bool, + _prefix: str = "load_model_state", +): + """Load the model from the state dict file given by the config options `load_model_state`. + + Loads the state dict always; this is meant, for example, for building a new model to deploy with a given state dict. + + If `load_model_state_strict` controls + > whether to strictly enforce that the keys in state_dict + > match the keys returned by this module's state_dict() function + + See https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.load_state_dict. + """ + if _prefix not in config: raise KeyError( - f"initialize_from_state requires the `{key}` option specifying the state to initialize from" + f"initialize_from_state requires the `{_prefix}` option specifying the state to initialize from" ) - state = torch.load(config[key]) - model.load_state_dict(state) + state = torch.load(config[_prefix]) + model.load_state_dict(state, strict=config.get(_prefix + "_strict", True)) return model diff --git a/nequip/nn/_interaction_block.py b/nequip/nn/_interaction_block.py index 6f70af20..99b3acc6 100644 --- a/nequip/nn/_interaction_block.py +++ b/nequip/nn/_interaction_block.py @@ -173,7 +173,7 @@ def forward(self, data: AtomicDataDict.Type) -> AtomicDataDict.Type: # Necessary to get TorchScript to be able to type infer when its not None avg_num_neigh: Optional[float] = self.avg_num_neighbors if avg_num_neigh is not None: - x = x.div(avg_num_neigh ** 0.5) + x = x.div(avg_num_neigh**0.5) x = self.linear_2(x) diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index f0803ff0..327f9016 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -90,6 +90,10 @@ def main(args=None): datas = itertools.cycle(datas) + if args.n == 0: + print("Got -n 0, so quitting without running benchmark.") + return + # Load model: print("Loading model... ") model = model_from_config(config, initialize=True, dataset=dataset) diff --git a/nequip/scripts/deploy.py b/nequip/scripts/deploy.py index 736de9a4..48cc187e 100644 --- a/nequip/scripts/deploy.py +++ b/nequip/scripts/deploy.py @@ -9,6 +9,7 @@ import pathlib import logging import warnings +import yaml # This is a weird hack to avoid Intel MKL issues on the cluster when this is called as a subprocess of a process that has itself initialized PyTorch. # Since numpy gets imported later anyway for dataset stuff, this shouldn't affect performance. 
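For reference, here is a hypothetical config sketch showing how the new `load_model_state` builder and its `_strict` option slot into a builder chain; the builder and option names come from this patch, but the file name and omitted options are placeholders.

```python
# Hypothetical config sketch: the builder chain and option names come from this
# patch, but "my_state.pth" and the omitted options are placeholders.
config = dict(
    model_builders=[
        "SimpleIrrepsConfig",
        "EnergyModel",
        "ForceOutput",
        "load_model_state",            # new builder added in this patch
    ],
    load_model_state="my_state.pth",   # path to a torch.save()-ed state dict
    load_model_state_strict=True,      # mirrors torch.nn.Module.load_state_dict(strict=...)
    # ... r_max, chemical_symbols, and the usual network options go here ...
)
# This dict would then be passed (via nequip.utils.Config) to
# nequip.model.model_from_config to build a model and populate it from the state dict.
```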
@@ -20,7 +21,8 @@ from e3nn.util.jit import script -from nequip.scripts.train import _set_global_options +from nequip.model import model_from_config +from nequip.scripts.train import _set_global_options, default_config from nequip.train import Trainer from nequip.utils import Config from nequip.utils.versions import check_code_version, get_config_code_versions @@ -34,6 +36,7 @@ N_SPECIES_KEY: Final[str] = "n_species" TYPE_NAMES_KEY: Final[str] = "type_names" JIT_BAILOUT_KEY: Final[str] = "_jit_bailout_depth" +JIT_FUSION_STRATEGY: Final[str] = "_jit_fusion_strategy" TF32_KEY: Final[str] = "allow_tf32" _ALL_METADATA_KEYS = [ @@ -45,6 +48,7 @@ N_SPECIES_KEY, TYPE_NAMES_KEY, JIT_BAILOUT_KEY, + JIT_FUSION_STRATEGY, TF32_KEY, ] @@ -113,8 +117,23 @@ def load_deployed_model( torch.backends.cudnn.allow_tf32 = allow_tf32 # JIT bailout - if metadata[JIT_BAILOUT_KEY] != "": - jit_bailout: int = int(metadata[JIT_BAILOUT_KEY]) + if int(torch.__version__.split(".")[1]) >= 11: + strategy = metadata.get(JIT_FUSION_STRATEGY, "") + if strategy != "": + strategy = [e.split(",") for e in strategy.split(";")] + strategy = [(e[0], int(e[1])) for e in strategy] + else: + strategy = default_config[JIT_FUSION_STRATEGY] + old_strat = torch.jit.set_fusion_strategy(strategy) + if set_global_options == "warn" and old_strat != strategy: + warnings.warn( + f"Loaded model had a different value for _jit_fusion_strategy ({strategy}) than was currently set ({old_strat}); changing the GLOBAL setting!" + ) + else: + jit_bailout: int = metadata.get(JIT_BAILOUT_KEY, "") + if jit_bailout == "": + jit_bailout = default_config[JIT_BAILOUT_KEY] + jit_bailout = int(jit_bailout) # no way to get current value, so assume we are overwriting it if set_global_options == "warn": warnings.warn( @@ -145,8 +164,13 @@ def main(args=None): build_parser = subparsers.add_parser("build", help="Build a deployment model") build_parser.add_argument( - "train_dir", - help="Path to a working directory from a training session.", + "--model", + help="Path to a YAML file defining a model to deploy. 
Unless you know why you need to, do not use this option.", + type=pathlib.Path, + ) + build_parser.add_argument( + "--train-dir", + help="Path to a working directory from a training session to deploy.", type=pathlib.Path, ) build_parser.add_argument( @@ -170,23 +194,29 @@ def main(args=None): print(config) elif args.command == "build": - if not args.train_dir.is_dir(): - raise ValueError(f"{args.train_dir} is not a directory") - if args.out_file.is_dir(): - raise ValueError( - f"{args.out_dir} is a directory, but a path to a file for the deployed model must be given" - ) + if args.model and args.train_dir: + raise ValueError("--model and --train-dir cannot both be specified.") + if args.train_dir is not None: + logging.info("Loading best_model from training session...") + config = Config.from_file(str(args.train_dir / "config.yaml")) + elif args.model is not None: + logging.info("Building model from config...") + config = Config.from_file(str(args.model), defaults=default_config) + else: + raise ValueError("one of --train-dir or --model must be given") - # load config - config = Config.from_file(str(args.train_dir / "config.yaml")) _set_global_options(config) - check_code_version(config) # -- load model -- - model, _ = Trainer.load_model_from_training_session( - args.train_dir, model_name="best_model.pth", device="cpu" - ) + if args.train_dir is not None: + model, _ = Trainer.load_model_from_training_session( + args.train_dir, model_name="best_model.pth", device="cpu" + ) + elif args.model is not None: + model = model_from_config(config) + else: + raise AssertionError # -- compile -- model = _compile_for_deploy(model) @@ -217,9 +247,13 @@ def main(args=None): metadata[N_SPECIES_KEY] = str(n_species) metadata[TYPE_NAMES_KEY] = " ".join(type_names) - metadata[JIT_BAILOUT_KEY] = str(config["_jit_bailout_depth"]) + metadata[JIT_BAILOUT_KEY] = str(config[JIT_BAILOUT_KEY]) + if int(torch.__version__.split(".")[1]) >= 11 and JIT_FUSION_STRATEGY in config: + metadata[JIT_FUSION_STRATEGY] = ";".join( + "%s,%i" % e for e in config[JIT_FUSION_STRATEGY] + ) metadata[TF32_KEY] = str(int(config["allow_tf32"])) - metadata[CONFIG_KEY] = (args.train_dir / "config.yaml").read_text() + metadata[CONFIG_KEY] = yaml.dump(dict(config)) metadata = {k: v.encode("ascii") for k, v in metadata.items()} torch.jit.save(model, args.out_file, _extra_files=metadata) diff --git a/nequip/scripts/evaluate.py b/nequip/scripts/evaluate.py index 12c521c6..093bc663 100644 --- a/nequip/scripts/evaluate.py +++ b/nequip/scripts/evaluate.py @@ -24,7 +24,7 @@ def main(args=None, running_as_script: bool = True): - # in results dir, do: nequip-deploy build . deployed.pth + # in results dir, do: nequip-deploy build --train-dir . deployed.pth parser = argparse.ArgumentParser( description=textwrap.dedent( """Compute the error of a model on a test set using various metrics. diff --git a/nequip/scripts/train.py b/nequip/scripts/train.py index a4304e5b..67cbdcc8 100644 --- a/nequip/scripts/train.py +++ b/nequip/scripts/train.py @@ -44,6 +44,15 @@ grad_anomaly_mode=False, append=False, _jit_bailout_depth=2, # avoid 20 iters of pain, see https://github.com/pytorch/pytorch/issues/52286 + # Quote from eelison in PyTorch slack: + # https://pytorch.slack.com/archives/CDZD1FANA/p1644259272007529?thread_ts=1644064449.039479&cid=CDZD1FANA + # > Right now the default behavior is to specialize twice on static shapes and then on dynamic shapes. 
+ # > To reduce warmup time you can do something like setFusionStrartegy({{FusionBehavior::DYNAMIC, 3}}) + # > ... Although we would wouldn't really expect to recompile a dynamic shape fusion in a model, + # > provided broadcasting patterns remain fixed + # We default to DYNAMIC alone because the number of edges is always dynamic, + # even if the number of atoms is fixed: + _jit_fusion_strategy=[("DYNAMIC", 3)], ) @@ -118,9 +127,14 @@ def _set_global_options(config): torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False - # For avoiding 20 steps of painfully slow JIT recompilation - # See https://github.com/pytorch/pytorch/issues/52286 - torch._C._jit_set_bailout_depth(config["_jit_bailout_depth"]) + if int(torch.__version__.split(".")[1]) >= 11: + # PyTorch >= 1.11 + k = "_jit_fusion_strategy" + torch.jit.set_fusion_strategy(config.get(k)) + else: + # For avoiding 20 steps of painfully slow JIT recompilation + # See https://github.com/pytorch/pytorch/issues/52286 + torch._C._jit_set_bailout_depth(config["_jit_bailout_depth"]) if config.model_debug_mode: set_irreps_debug(enabled=True) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 6257fbca..3b8a1698 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -257,6 +257,7 @@ def __init__( **kwargs, ): self._initialized = False + self.cumulative_wall = 0 logging.debug("* Initialize Trainer") # store all init arguments @@ -408,14 +409,14 @@ def init_objects(self): ) n_args = 0 for key, item in kwargs.items(): - # prepand VALIDATION string if k is not with + # prepend VALIDATION string if k is not with if isinstance(item, dict): new_dict = {} for k, v in item.items(): if ( k.lower().startswith(VALIDATION) or k.lower().startswith(TRAIN) - or k.lower() in ["lr", "wall"] + or k.lower() in ["lr", "wall", "cumulative_wall"] ): new_dict[k] = item[k] else: @@ -435,7 +436,7 @@ def init_objects(self): for key in self.train_on_keys: if key not in self.model.irreps_out: raise RuntimeError( - "Loss function include fields that are not predicted by the model" + f"Loss function include fields {key} that are not predicted by the model {self.model.irreps_out}" ) @property @@ -499,6 +500,7 @@ def as_dict( dictionary["state_dict"]["cuda_rng_state"] = torch.cuda.get_rng_state( device=self.torch_device ) + dictionary["state_dict"]["cumulative_wall"] = self.cumulative_wall if training_progress: dictionary["progress"] = {} @@ -634,6 +636,7 @@ def from_dict(cls, dictionary, append: Optional[bool] = None): if item is not None: item.load_state_dict(state_dict[key]) trainer._initialized = True + trainer.cumulative_wall = state_dict["cumulative_wall"] torch.set_rng_state(state_dict["rng_state"]) trainer.dataset_rng.set_state(state_dict["dataset_rng_state"]) @@ -711,6 +714,7 @@ def init(self): self.init_objects() self._initialized = True + self.cumulative_wall = 0 def init_metrics(self): if self.metrics_components is None: @@ -750,6 +754,7 @@ def train(self): self.init_log() self.wall = perf_counter() + self.previous_cumulative_wall = self.cumulative_wall with atomic_write_group(): if self.iepoch == -1: @@ -1030,7 +1035,9 @@ def final_log(self): self.logger.info(f"! 
Stop training: {self.stop_arg}") wall = perf_counter() - self.wall + self.cumulative_wall = wall + self.previous_cumulative_wall self.logger.info(f"Wall time: {wall}") + self.logger.info(f"Cumulative wall time: {self.cumulative_wall}") def end_of_epoch_log(self): """ @@ -1039,10 +1046,12 @@ def end_of_epoch_log(self): lr = self.optim.param_groups[0]["lr"] wall = perf_counter() - self.wall + self.cumulative_wall = wall + self.previous_cumulative_wall self.mae_dict = dict( LR=lr, epoch=self.iepoch, wall=wall, + cumulative_wall=self.cumulative_wall, ) header = "epoch, wall, LR" @@ -1068,7 +1077,7 @@ def end_of_epoch_log(self): # append details from loss for key, value in self.loss_dict[category].items(): mat_str += f", {value:16.5g}" - header += f", {category}_{key}" + header += f",{category}_{key}" log_str[category] += f" {value:12.3g}" log_header[category] += f" {key:>12.12}" self.mae_dict[f"{category}_{key}"] = value @@ -1076,7 +1085,7 @@ def end_of_epoch_log(self): # append details from metrics for key, value in met.items(): mat_str += f", {value:12.3g}" - header += f", {category}_{key}" + header += f",{category}_{key}" if key not in skip_keys: log_str[category] += f" {value:12.3g}" log_header[category] += f" {key:>12.12}" diff --git a/nequip/utils/regressor.py b/nequip/utils/regressor.py index 930af115..3d23cf84 100644 --- a/nequip/utils/regressor.py +++ b/nequip/utils/regressor.py @@ -16,7 +16,7 @@ def solver(X, y, regressor: Optional[str] = "NormalizedGaussianProcess", **kwarg def normalized_gp(X, y, **kwargs): - feature_rms = 1.0 / np.sqrt(np.average(X ** 2, axis=0)) + feature_rms = 1.0 / np.sqrt(np.average(X**2, axis=0)) feature_rms = np.nan_to_num(feature_rms, 1) y_mean = torch.sum(y) / torch.sum(X) mean, std = base_gp( diff --git a/nequip/utils/savenload.py b/nequip/utils/savenload.py index c30f7496..53b09fcf 100644 --- a/nequip/utils/savenload.py +++ b/nequip/utils/savenload.py @@ -118,7 +118,6 @@ def finish_all_writes(): _MOVE_QUEUE.join() # ^ wait for all remaining moves to be processed - else: def _submit_move(from_name, to_name, blocking: bool): diff --git a/setup.py b/setup.py index ad57b290..e827a58b 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ description="NequIP is an open-source code for building E(3)-equivariant interatomic potentials.", download_url="https://github.com/mir-group/nequip", author="Simon Batzner, Albert Musealian, Lixin Sun, Mario Geiger, Anders Johansson, Tess Smidt", - python_requires=">=3.6", + python_requires=">=3.7", packages=find_packages(include=["nequip", "nequip.*"]), entry_points={ # make the scripts available as command line scripts @@ -29,7 +29,7 @@ "numpy", "ase", "tqdm", - "torch>=1.8,<=1.11", # torch.fx added in 1.8 + "torch>=1.8,<=1.12,!=1.9.0", # torch.fx added in 1.8 "e3nn>=0.3.5,<0.5.0", "pyyaml", "contextlib2;python_version<'3.7'", # backport of nullcontext diff --git a/tests/integration/test_deploy.py b/tests/integration/test_deploy.py index e2a6e500..cc710f11 100644 --- a/tests/integration/test_deploy.py +++ b/tests/integration/test_deploy.py @@ -27,14 +27,18 @@ def test_deploy(BENCHMARK_ROOT, device): # # TODO: is this true? 
# pytest.skip("CUDA and subprocesses have issues") - keys = [AtomicDataDict.TOTAL_ENERGY_KEY, AtomicDataDict.FORCE_KEY] + keys = [ + AtomicDataDict.TOTAL_ENERGY_KEY, + AtomicDataDict.FORCE_KEY, + AtomicDataDict.PER_ATOM_ENERGY_KEY, + ] config_path = pathlib.Path(__file__).parents[2] / "configs/minimal.yaml" true_config = yaml.load(config_path.read_text(), Loader=yaml.Loader) with tempfile.TemporaryDirectory() as tmpdir: # Save time run_name = "test_deploy" + dtype - root = tmpdir + "nequip_rootdir/" + root = tmpdir + "/nequip_rootdir/" true_config["run_name"] = run_name true_config["root"] = root true_config["dataset_file_name"] = str( @@ -54,7 +58,13 @@ def test_deploy(BENCHMARK_ROOT, device): # Deploy deployed_path = pathlib.Path(f"deployed_{dtype}.pth") retcode = subprocess.run( - ["nequip-deploy", "build", f"{root}/{run_name}/", str(deployed_path)], + [ + "nequip-deploy", + "build", + "--train-dir", + f"{root}/{run_name}/", + str(deployed_path), + ], cwd=tmpdir, ) retcode.check_returncode() @@ -73,7 +83,7 @@ def test_deploy(BENCHMARK_ROOT, device): dataset = dataset_from_config(Config.from_file(full_config_path)) data = AtomicData.to_AtomicDataDict(dataset[0].to(device)) for k in keys: - data.pop(k) + data.pop(k, None) train_pred = best_mod(data) train_pred = {k: train_pred[k].to("cpu") for k in keys} @@ -92,7 +102,7 @@ def test_deploy(BENCHMARK_ROOT, device): data_idx = 0 data = AtomicData.to_AtomicDataDict(dataset[data_idx].to("cpu")) for k in keys: - data.pop(k) + data.pop(k, None) deploy_pred = deploy_mod(data) deploy_pred = {k: deploy_pred[k].to("cpu") for k in keys} for k in keys: @@ -127,10 +137,14 @@ def test_deploy(BENCHMARK_ROOT, device): ase_pred = { AtomicDataDict.TOTAL_ENERGY_KEY: atoms.get_potential_energy(), AtomicDataDict.FORCE_KEY: atoms.get_forces(), + AtomicDataDict.PER_ATOM_ENERGY_KEY: atoms.get_potential_energies(), } + assert ase_pred[AtomicDataDict.TOTAL_ENERGY_KEY].shape == tuple() + assert ase_pred[AtomicDataDict.FORCE_KEY].shape == (len(atoms), 3) + assert ase_pred[AtomicDataDict.PER_ATOM_ENERGY_KEY].shape == (len(atoms),) for k in keys: assert torch.allclose( - deploy_pred[k], + deploy_pred[k].squeeze(-1), torch.as_tensor(ase_pred[k], dtype=torch.get_default_dtype()), atol=atol, ) diff --git a/tests/unit/data/test_dataset.py b/tests/unit/data/test_dataset.py index f0a04832..99616cb1 100644 --- a/tests/unit/data/test_dataset.py +++ b/tests/unit/data/test_dataset.py @@ -34,6 +34,7 @@ def ase_file(molecules): @pytest.fixture(scope="function") def npz(): + np.random.seed(0) natoms = NATOMS nframes = 8 yield dict( @@ -219,6 +220,7 @@ def test_per_graph_field(self, npz_dataset, fixed_field, subset, key, dim): if npz_dataset is None: return + torch.manual_seed(0) E = torch.rand((npz_dataset.len(),) + dim) ref_mean = torch.mean(E / NATOMS, dim=0) ref_std = torch.std(E / NATOMS, dim=0) @@ -296,9 +298,9 @@ def test_per_graph_field( del Ns if alpha == 1e-5: - ref_mean, ref_std, E = generate_E(N, 1000, 0.0) + ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.0) else: - ref_mean, ref_std, E = generate_E(N, 1000, 0.5) + ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.5) if subset: E_orig_order = torch.zeros_like( @@ -337,7 +339,7 @@ def test_per_graph_field( if alpha == 1e-5: assert torch.allclose(mean, ref_mean, rtol=1e-1) else: - assert torch.allclose(mean, ref_mean, rtol=2) + assert torch.allclose(mean, ref_mean, rtol=1) assert torch.allclose(std, torch.zeros_like(ref_mean), atol=alpha * 100) elif regressor == "NormalizedGaussianProcess": assert 
torch.std(mean).numpy() == 0 @@ -435,9 +437,9 @@ def test_from_atoms(self, molecules): ) -def generate_E(N, mean, std): +def generate_E(N, mean_min, mean_max, std): torch.manual_seed(0) - ref_mean = torch.rand((N.shape[1])) * mean + ref_mean = torch.rand((N.shape[1])) * (mean_max - mean_min) + mean_min t_mean = torch.ones((N.shape[0], 1)) * ref_mean.reshape([1, -1]) ref_std = torch.rand((N.shape[1])) * std t_std = torch.ones((N.shape[0], 1)) * ref_std.reshape([1, -1]) diff --git a/tests/unit/utils/test_gp.py b/tests/unit/utils/test_gp.py new file mode 100644 index 00000000..4792b9d2 --- /dev/null +++ b/tests/unit/utils/test_gp.py @@ -0,0 +1,37 @@ +import torch +import pytest + +from nequip.utils.regressor import base_gp +from sklearn.gaussian_process.kernels import DotProduct + + +# @pytest.mark.parametrize("full_rank", [True, False]) +@pytest.mark.parametrize("full_rank", [False]) +@pytest.mark.parametrize("alpha", [0, 1e-3, 0.1, 1]) +def test_random(full_rank, alpha): + + if alpha == 0 and not full_rank: + return + + torch.manual_seed(0) + n_samples = 10 + n_dim = 3 + + if full_rank: + X = torch.randint(low=1, high=10, size=(n_samples, n_dim)) + else: + X = torch.randint(low=1, high=10, size=(n_samples, 1)) * torch.ones( + (n_samples, n_dim) + ) + + ref_mean = torch.rand((n_dim, 1)) + y = torch.matmul(X, ref_mean) + + mean, std = base_gp( + X, y, DotProduct, {"sigma_0": 0, "sigma_0_bounds": "fixed"}, alpha=0.1 + ) + + if full_rank: + assert torch.allclose(ref_mean, mean, rtol=0.5) + else: + assert torch.allclose(mean, mean[0], rtol=1e-3)
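
Finally, a small standalone sketch of the string encoding that the `nequip/scripts/deploy.py` hunk above uses for the `_jit_fusion_strategy` metadata:

```python
# Standalone sketch of the string encoding used for the _jit_fusion_strategy
# metadata in the nequip/scripts/deploy.py hunk above: a list of (kind, depth)
# pairs is serialized as "kind,depth;kind,depth" and parsed back the same way.
strategy = [("DYNAMIC", 3)]
encoded = ";".join("%s,%i" % e for e in strategy)  # -> "DYNAMIC,3"
decoded = [(kind, int(depth)) for kind, depth in (e.split(",") for e in encoded.split(";"))]
assert decoded == strategy
```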