diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
new file mode 100644
index 00000000..0ce4cfa6
--- /dev/null
+++ b/.github/workflows/python-app.yml
@@ -0,0 +1,47 @@
+name: Test NeuralForceField package
+
+on: [push]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # python-version: ["pypy3.10", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+      - name: Install basics
+        run: python -m pip install --upgrade pip setuptools wheel
+      - name: Install package
+        run: python -m pip install .
+      # - name: Install linters
+      #   run: python -m pip install flake8 mypy pylint
+      # - name: Install documentation requirements
+      #   run: python -m pip install -r docs/requirements.txt
+      # - name: Test with flake8
+      #   run: flake8 nff
+      # - name: Test with mypy
+      #   run: mypy nff
+      # - name: Test with pylint
+      #   run: pylint nff
+      - name: Test with pytest
+        run: |
+          pip install pytest pytest-cov
+          pytest nff/tests --doctest-modules --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov=nff --cov-report=xml --cov-report=html
+      - name: Upload pytest test results
+        uses: actions/upload-artifact@v4
+        with:
+          name: pytest-results-${{ matrix.python-version }}
+          path: junit/test-results-${{ matrix.python-version }}.xml
+        if: ${{ always() }}
+      # - name: Test documentation
+      #   run: sphinx-build docs/source docs/build
diff --git a/.gitignore b/.gitignore
index 6181965e..4b6cfd09 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,5 +66,17 @@ dist/
 sandbox_excited/
 build/
 
+# Editor files
+# vim
+*.swp
+*.swo
+
+# pycharm
+.idea/
+
+# coverage and tests
+junit
+.coverage
+
 # required exceptions
 !tutorials/models/ammonia/Ammonia.xyz
diff --git a/nff/data/dataset.py b/nff/data/dataset.py
index 974ccb8c..7bd20958 100644
--- a/nff/data/dataset.py
+++ b/nff/data/dataset.py
@@ -86,6 +86,7 @@ def __init__(
         units: str = "kcal/mol",
         check_props: bool = True,
         do_copy: bool = True,
+        device: str = "cuda",
     ) -> None:
         """Constructor for Dataset class.
 
@@ -108,6 +109,7 @@
         self.props = props
         self.units = units
         self.to_units(units)
+        self.device = device
 
     def __len__(self) -> int:
         """Length of the dataset.
@@ -289,6 +291,7 @@ def _get_periodic_neighbor_list(
             pbc=True,
             cutoff=cutoff,
             directed=(not undirected),
+            device=self.device,
         )
         nbrs, offs = atoms.update_nbr_list()
         nbrlist.append(nbrs)
@@ -444,6 +447,7 @@ def unwrap_xyz(self, mol_dic: dict) -> None:
                 numbers=self.props["nxyz"][i][:, 0],
                 cell=self.props["cell"][i],
                 pbc=True,
+                device=self.device,
             )
 
             # reconstruct coordinates based on subgraphs index
@@ -577,6 +581,7 @@ def gen_bond_prior(self, cutoff: float, bond_len_dict: dict | None = None) -> None:
                 "cutoff": cutoff,
                 "cell": cell,
                 "nbr_torch": False,
+                "device": self.device,
             }
 
             # the coordinates have been unwrapped; now try to compute the resulting offsets
diff --git a/nff/data/tests/__init__.py b/nff/tests/__init__.py
similarity index 100%
rename from nff/data/tests/__init__.py
rename to nff/tests/__init__.py
diff --git a/nff/tests/conftest.py b/nff/tests/conftest.py
new file mode 100644
index 00000000..66ca0a47
--- /dev/null
+++ b/nff/tests/conftest.py
@@ -0,0 +1,15 @@
+
+import os
+import pytest
+import torch
+
+torch.set_num_threads(int(os.getenv("OMP_NUM_THREADS", 1)))
+
+
+def pytest_addoption(parser):
+    parser.addoption("--device", action="store", default="cpu", help="Device to run the tests on ('cpu' or 'cuda')")
+
+
+@pytest.fixture
+def device(request):
+    return request.config.getoption("--device")
diff --git a/nff/tests/data/azo_diabat.pth.tar b/nff/tests/data/azo_diabat.pth.tar
new file mode 100644
index 00000000..1065bcae
Binary files /dev/null and b/nff/tests/data/azo_diabat.pth.tar differ
diff --git a/nff/tests/data/dataset.pth.tar b/nff/tests/data/dataset.pth.tar
new file mode 100644
index 00000000..51dff90c
Binary files /dev/null and b/nff/tests/data/dataset.pth.tar differ
diff --git a/nff/md/zhu_nakamura/dynamics_test.py b/nff/tests/dynamics_test.py
similarity index 99%
rename from nff/md/zhu_nakamura/dynamics_test.py
rename to nff/tests/dynamics_test.py
index bf391d24..41eb460a 100644
--- a/nff/md/zhu_nakamura/dynamics_test.py
+++ b/nff/tests/dynamics_test.py
@@ -15,8 +15,8 @@
 from ase.io.trajectory import Trajectory
 from ase import Atoms
 
-from nff.md.utils import mol_dot, mol_norm, ZhuNakamuraLogger, atoms_to_nxyz
-from nff.md.nvt_test import NoseHoover, NoseHooverChain
+from nff.md.utils_ax import mol_dot, mol_norm, ZhuNakamuraLogger, atoms_to_nxyz
+from nff.md.nvt_ax import NoseHoover, NoseHooverChain
 from nff.utils.constants import BOHR_RADIUS, FS_TO_AU, AMU_TO_AU, FS_TO_ASE, ASE_TO_FS, EV_TO_AU
 from nff.data import Dataset, collate_dicts
 from nff.utils.cuda import batch_to
diff --git a/nff/io/tests/test_ase.py b/nff/tests/test_ase.py
similarity index 92%
rename from nff/io/tests/test_ase.py
rename to nff/tests/test_ase.py
index 14ee02d7..6ebe735b 100644
--- a/nff/io/tests/test_ase.py
+++ b/nff/tests/test_ase.py
@@ -5,6 +5,8 @@
 import numpy as np
 from ase import Atoms
 
+import pytest
+
 from nff.io.ase import AtomsBatch
 
 
@@ -19,6 +21,8 @@ def compare_dicts(d1: dict, d2: dict):
     for key, value in d1.items():
         if isinstance(value, dict):
             compare_dicts(value, d2[key])
+        elif isinstance(value, str):
+            assert value == d2[key]
         elif isinstance(value, Iterable):
             assert np.allclose(value, d2[key])
         else:
@@ -47,10 +51,17 @@ def get_ethanol():
 
     return Atoms(nxyz[:, 0].astype(int), positions=nxyz[:, 1:])
 
 
-# @ut.skip("skip this for now")
+@pytest.mark.usefixtures("device")  # Ensure the fixture is accessible
 class TestAtomsBatch(ut.TestCase):
     def setUp(self):
         self.ethanol = get_ethanol()
+        # Access the device value from the pytest fixture
+        self.device = self._test_fixture_device
+
+    @pytest.fixture(autouse=True)
+    def inject_device(self, device):
+        # Automatically set the fixture value to an attribute
+        self._test_fixture_device = device
 
     @ut.skip("skip this for now")
     def test_AtomsBatch(self):
@@ -111,7 +122,7 @@
             ]
         )
 
-        atoms_batch = AtomsBatch(self.ethanol, cutoff=2.5)
+        atoms_batch = AtomsBatch(self.ethanol, cutoff=2.5, device=self.device)
         atoms_batch.update_nbr_list()
 
         G1 = nx.from_edgelist(expected_nbrlist_cutoff_2dot5)
@@ -120,13 +131,13 @@
         assert nx.is_isomorphic(G1, G2)
 
     def test_get_batch(self):
-        atoms_batch = AtomsBatch(self.ethanol, cutoff=5)
+        atoms_batch = AtomsBatch(self.ethanol, cutoff=5, device=self.device)
         batch = atoms_batch.get_batch()
 
         assert "nxyz" in batch
 
     def test_from_atoms(self):
-        atoms_batch = AtomsBatch.from_atoms(self.ethanol, cutoff=2.5)
+        atoms_batch = AtomsBatch.from_atoms(self.ethanol, cutoff=2.5, device=self.device)
 
         # ensure atomic numbers, positions, and cell are the same
         assert np.allclose(atoms_batch.get_atomic_numbers(), self.ethanol.get_atomic_numbers())
@@ -134,7 +145,7 @@
         assert np.allclose(atoms_batch.get_cell(), self.ethanol.get_cell())
 
     def test_copy(self):
-        atoms_batch = AtomsBatch(self.ethanol, cutoff=2.5)
+        atoms_batch = AtomsBatch(self.ethanol, cutoff=2.5, device=self.device)
         atoms_batch.get_batch()  # update props
 
         atoms_batch_copy = atoms_batch.copy()
@@ -154,7 +165,7 @@
         assert atoms_batch.requires_large_offsets == atoms_batch_copy.requires_large_offsets
 
     def test_fromdict(self):
-        atoms_batch = AtomsBatch(self.ethanol, cutoff=2.5)
+        atoms_batch = AtomsBatch(self.ethanol, cutoff=2.5, device=self.device)
         ab_dict = atoms_batch.todict(update_props=True)
 
         ab_from_dict = AtomsBatch.fromdict(ab_dict)
@@ -183,6 +194,7 @@
         compare_dicts(ab_dict_props, ab_dict_again_props)
 
 
+@pytest.mark.usefixtures("device")  # Ensure the fixture is loaded
 class TestPeriodic(ut.TestCase):
     def setUp(self):
         nxyz = np.array(
@@ -205,9 +217,15 @@
             [0.0, 0.0, 5.51891759],
             ]
         )
-        self.quartz = AtomsBatch(nxyz[:, 0].astype(int), positions=nxyz[:, 1:], cell=lattice, pbc=True)
-
-    def test_ase(self):
+        self.quartz = AtomsBatch(nxyz[:, 0].astype(int), positions=nxyz[:, 1:], cell=lattice, pbc=True,
+                                 device=self._test_fixture_device)
+
+    @pytest.fixture(autouse=True)
+    def inject_device(self, device):
+        # Automatically set the fixture value to an attribute
+        self._test_fixture_device = device
+
+    def test_print(self):
         print(self.quartz)
 
     def test_nbrlist(self):
@@ -469,7 +487,6 @@
             ]
         )
         assert np.allclose(nbrlist, expected_nbrlist)
-        print(offsets)
 
 
 if __name__ == "__main__":
diff --git a/nff/io/tests/__init__.py b/nff/tests/test_data/__init__.py
similarity index 100%
rename from nff/io/tests/__init__.py
rename to nff/tests/test_data/__init__.py
diff --git a/nff/data/tests/data/SrIrO3_bulk_55_nff_all_dataset.pth.tar b/nff/tests/test_data/data/SrIrO3_bulk_55_nff_all_dataset.pth.tar
similarity index 100%
rename from nff/data/tests/data/SrIrO3_bulk_55_nff_all_dataset.pth.tar
rename to nff/tests/test_data/data/SrIrO3_bulk_55_nff_all_dataset.pth.tar
diff --git a/nff/data/tests/test_dataset.py b/nff/tests/test_data/test_dataset.py
similarity index 93%
rename from nff/data/tests/test_dataset.py
rename to nff/tests/test_data/test_dataset.py
index cdbf73bf..16ee7fc4 100644
--- a/nff/data/tests/test_dataset.py
+++ b/nff/tests/test_data/test_dataset.py
@@ -6,6 +6,8 @@
 import numpy as np
 import torch
 
+import pytest
+
 from nff.data.dataset import (
     Dataset,
     concatenate_dict,
@@ -14,8 +16,8 @@
 )
 
 current_path = Path(__file__).parent
-DATASET_PATH = current_path / "../../../tutorials/data/dataset.pth.tar"
-PEROVSKITE_DATA_PATH = current_path / "./data/SrIrO3_bulk_55_nff_all_dataset.pth.tar"
+DATASET_PATH = os.path.join(current_path, "..", "..", "..", "tutorials", "data", "dataset.pth.tar")
+PEROVSKITE_DATA_PATH = os.path.join(current_path, "data", "SrIrO3_bulk_55_nff_all_dataset.pth.tar")
 TARG_NAME = "formula"
 VAL_SIZE = 0.1
 TEST_SIZE = 0.1
@@ -223,6 +225,7 @@
         self.assertEqual(ab, expected)
 
 
+@pytest.mark.usefixtures("device")  # Ensure the fixture is accessible
 class TestPeriodicDataset(unittest.TestCase):
     def setUp(self):
         self.quartz = {
@@ -248,7 +251,12 @@
             ),
         }
 
-        self.qtz_dataset = Dataset(concatenate_dict(*[self.quartz] * 3))
+        self.qtz_dataset = Dataset(concatenate_dict(*[self.quartz] * 3), device=self._test_fixture_device)
+
+    @pytest.fixture(autouse=True)
+    def inject_device(self, device):
+        # Automatically set the fixture value to an attribute
+        self._test_fixture_device = device
 
     def test_neighbor_list(self):
         nbrs, offs = self.qtz_dataset.generate_neighbor_list(cutoff=5)
diff --git a/nff/data/tests/test_stats.py b/nff/tests/test_data/test_stats.py
similarity index 100%
rename from nff/data/tests/test_stats.py
rename to nff/tests/test_data/test_stats.py
diff --git a/nff/tests/test_excited_states_training.py b/nff/tests/test_excited_states_training.py
new file mode 100644
index 00000000..8754539d
--- /dev/null
+++ b/nff/tests/test_excited_states_training.py
@@ -0,0 +1,136 @@
+
+import os
+import pathlib
+
+import torch
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+from torch.utils.data.sampler import RandomSampler
+
+import pytest
+
+from nff.data import Dataset, split_train_validation_test, collate_dicts
+from nff.train import Trainer, get_model, loss, hooks, metrics, evaluate
+
+
+@pytest.mark.skip("still taking too long, disable for now")
+def test_excited_training(device, tmpdir):
+    # define loss
+    loss_dict = {
+        "mse": [
+            {"coef": 0.01, "params": {"key": "d_00"}},
+            {"coef": 0.01, "params": {"key": "d_11"}},
+            {"coef": 0.01, "params": {"key": "d_22"}},
+            {"coef": 0.2, "params": {"key": "energy_0"}},
+            {"coef": 1, "params": {"key": "energy_0_grad"}},
+            {"coef": 0.1, "params": {"key": "energy_1"}},
+            {"coef": 1, "params": {"key": "energy_1_grad"}},
+            {"coef": 0.5, "params": {"key": "energy_1_energy_0_delta"}},
+        ],
+        "nacv": [{"coef": 1, "params": {"abs": False, "key": "force_nacv_10", "max": False}}],
+    }
+    loss_fn = loss.build_multi_loss(loss_dict)
+
+    # define model
+    diabat_keys = [["d_00", "d_01", "d_02"], ["d_01", "d_11", "d_12"], ["d_02", "d_12", "d_22"]]
+    modelparams = {
+        "feat_dim": 128,
+        "activation": "swish",
+        "n_rbf": 20,
+        "cutoff": 5.0,
+        "num_conv": 3,
+        "output_keys": ["energy_0", "energy_1"],
+        "grad_keys": ["energy_0_grad", "energy_1_grad"],
+        "diabat_keys": diabat_keys,
+        "add_nacv": True,
+    }
+    model = get_model(modelparams, model_type="PainnDiabat")
+
+    # define training
+    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
+    optimizer = Adam(trainable_params, lr=1e-4)
+    train_metrics = [
+        metrics.MeanAbsoluteError("energy_0"),
+        metrics.MeanAbsoluteError("energy_1"),
+        metrics.MeanAbsoluteError("energy_0_grad"),
+        metrics.MeanAbsoluteError("energy_1_grad"),
+        metrics.MeanAbsoluteError("energy_1_energy_0_delta"),
+    ]
+
+    # output
+    outdir = tmpdir
+    train_hooks = [
+        hooks.CSVHook(
+            outdir,
+            metrics=train_metrics,
+        ),
+        hooks.PrintingHook(outdir, metrics=train_metrics, separator=" | ", time_strf="%M:%S"),
+        hooks.ReduceLROnPlateauHook(
+            optimizer=optimizer,
+            # patience in the original paper
+            patience=50,
+            factor=0.5,
+            min_lr=1e-7,
+            window_length=1,
+            stop_after_min=True,
+        ),
+    ]
+
+    # data set
+    dset = Dataset.from_file(os.path.join(pathlib.Path(__file__).parent.absolute(), "data/azo_diabat.pth.tar"))
+    train, val, test = split_train_validation_test(dset, val_size=0.1, test_size=0.1)
+    batch_size = 20
+    train_loader = DataLoader(train, batch_size=batch_size, collate_fn=collate_dicts, sampler=RandomSampler(train))
+    val_loader = DataLoader(val, batch_size=batch_size, collate_fn=collate_dicts)
+    test_loader = DataLoader(test, batch_size=batch_size, collate_fn=collate_dicts)
+
+
+    # train
+    T = Trainer(
+        model_path=outdir,
+        model=model,
+        loss_fn=loss_fn,
+        optimizer=optimizer,
+        train_loader=train_loader,
+        validation_loader=val_loader,
+        checkpoint_interval=1,
+        hooks=train_hooks,
+        mini_batches=1,
+    )
+    T.train(device=device, n_epochs=10)
+
+    # evaluation
+    def correct_nacv(results, targets, key):
+        num_atoms = targets["num_atoms"]
+        if not isinstance(num_atoms, list):
+            num_atoms = num_atoms.tolist()
+        pred = torch.split(torch.cat(results[key]), num_atoms)
+        targ = torch.split(torch.cat(targets[key]), num_atoms)
+
+        real_pred = []
+
+        for p, t in zip(pred, targ):
+            sub_err = (p - t).abs().mean()
+            add_err = (p + t).abs().mean()
+            sign = 1 if sub_err < add_err else -1
+            real_pred.append(sign * p)
+
+        return real_pred
+
+    results, targets, test_loss = evaluate(
+        T.get_best_model(), test_loader, loss_fn=lambda x, y: torch.Tensor([0]), device=device
+    )
+    real_nacv = correct_nacv(results, targets, "force_nacv_10")
+    results["force_nacv_10"] = real_nacv
+
+    en_keys = ["energy_0", "energy_1", "energy_1_energy_0_delta"]
+    grad_keys = ["energy_0_grad", "energy_1_grad"]
+
+    for key in [*en_keys, *grad_keys, "force_nacv_10"]:
+        pred = results[key]
+        targ = targets[key]
+        targ_dim = len(targets["energy_0"][0].shape)
+        fn = torch.stack if targ_dim == 0 else torch.cat
+        pred = torch.cat(pred).reshape(-1)
+        targ = fn(targ).reshape(-1)
+        assert abs(pred - targ).mean() < 12.0
diff --git a/nff/tests/test_training.py b/nff/tests/test_training.py
new file mode 100644
index 00000000..8e7772d5
--- /dev/null
+++ b/nff/tests/test_training.py
@@ -0,0 +1,72 @@
+import os
+import pathlib
+
+import torch
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+
+from nff.data import Dataset, split_train_validation_test, collate_dicts, to_tensor
+from nff.train import Trainer, get_model, loss, hooks, metrics, evaluate
+
+
+def test_training(device, tmpdir):
+    # data set
+    OUTDIR = tmpdir
+    dataset = Dataset.from_file(os.path.join(pathlib.Path(__file__).parent.absolute(), "data", "dataset.pth.tar"))
+    train, val, test = split_train_validation_test(dataset, val_size=0.2, test_size=0.2)
+    train_loader = DataLoader(train, batch_size=50, collate_fn=collate_dicts)
+    val_loader = DataLoader(val, batch_size=50, collate_fn=collate_dicts)
+    test_loader = DataLoader(test, batch_size=50, collate_fn=collate_dicts)
+
+    # define model
+    params = {
+        "n_atom_basis": 256,
+        "n_filters": 256,
+        "n_gaussians": 32,
+        "n_convolutions": 4,
+        "cutoff": 5.0,
+        "trainable_gauss": True,
+        "dropout_rate": 0.2,
+    }
+    model = get_model(params)
+
+
+    # define training
+    loss_fn = loss.build_mse_loss(loss_coef={"energy": 0.01, "energy_grad": 1})
+    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
+    optimizer = Adam(trainable_params, lr=3e-4)
+    train_metrics = [metrics.MeanAbsoluteError("energy"), metrics.MeanAbsoluteError("energy_grad")]
+
+    # output
+    train_hooks = [
+        hooks.MaxEpochHook(7),
+        hooks.CSVHook(
+            OUTDIR,
+            metrics=train_metrics,
+        ),
+        hooks.PrintingHook(OUTDIR, metrics=train_metrics, separator=" | ", time_strf="%M:%S"),
+        hooks.ReduceLROnPlateauHook(
+            optimizer=optimizer, patience=30, factor=0.5, min_lr=1e-7, window_length=1, stop_after_min=True
+        ),
+    ]
+
+    # train
+    T = Trainer(
+        model_path=OUTDIR,
+        model=model,
+        loss_fn=loss_fn,
+        optimizer=optimizer,
+        train_loader=train_loader,
+        validation_loader=val_loader,
+        checkpoint_interval=1,
+        hooks=train_hooks,
+    )
+    T.train(device=device, n_epochs=7)
+
+    # evaluation on the held-out test set
+    results, targets, test_loss = evaluate(T.get_best_model(), test_loader, loss_fn, device=device)
+    for key in ["energy_grad", "energy"]:
+        pred = torch.stack(results[key], dim=0).view(-1).detach().cpu().numpy()
+        targ = torch.stack(targets[key], dim=0).view(-1).detach().cpu().numpy()
+        mae = abs(pred - targ).mean()
+        assert mae < 10.0
diff --git a/pyproject.toml b/pyproject.toml
index 2e3c0e6c..01fd4ec8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ dependencies = [
     "rdkit",
     "scikit-learn",
     "scipy",
-    "torch>=2.2.0",
+    "torch>=2.2.0,<2.6.0",
     "tqdm",
     "mace-torch>=0.3.4",
     "chgnet>=0.3.5",
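
For reference, a minimal sketch of how the new `--device` option from `nff/tests/conftest.py` is consumed by a plain pytest function. The module name and test body below are illustrative assumptions, not part of this diff; only `unittest.TestCase` classes need the `inject_device` shim used above, since plain test functions can take the `device` fixture directly:

    # hypothetical nff/tests/test_device_example.py -- illustration only
    import torch

    def test_tensor_on_requested_device(device):
        # `device` is the conftest.py fixture: the value passed via
        # `pytest --device <name>`, defaulting to "cpu"
        x = torch.ones(3, device=device)
        assert str(x.device).startswith(device)

Usage: `pytest nff/tests --device cpu`, or `--device cuda` on a GPU runner.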