diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
index f164758304..45b689cb3e 100644
--- a/.github/workflows/test_cuda.yml
+++ b/.github/workflows/test_cuda.yml
@@ -42,6 +42,7 @@ jobs:
         DP_BUILD_TESTING: 1
         DP_VARIANT: cuda
         CUDA_PATH: /usr/local/cuda-12.2
+        NUM_WORKERS: 0
     - run: dp --version
     - run: python -m pytest -s --cov=deepmd source/tests --durations=0
     - run: source/install/test_cc_local.sh
diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py
index 7c95f66c9c..7a6684e82e 100644
--- a/deepmd/pt/utils/dataloader.py
+++ b/deepmd/pt/utils/dataloader.py
@@ -276,13 +276,11 @@ def collate_batch(batch):
                 result[key] = torch.zeros(
                     (n_frames, natoms_extended, 3),
                     dtype=env.GLOBAL_PT_FLOAT_PRECISION,
-                    device=env.PREPROCESS_DEVICE,
                 )
             else:
                 result[key] = torch.zeros(
                     (n_frames, natoms_extended),
                     dtype=torch.long,
-                    device=env.PREPROCESS_DEVICE,
                 )
             for i in range(len(batch)):
                 natoms_tmp = list[i].shape[0]
diff --git a/deepmd/pt/utils/dataset.py b/deepmd/pt/utils/dataset.py
index c104e64491..68d4a09ce4 100644
--- a/deepmd/pt/utils/dataset.py
+++ b/deepmd/pt/utils/dataset.py
@@ -477,11 +477,7 @@ def preprocess(self, batch):
             if "find_" in kk:
                 pass
             else:
-                batch[kk] = torch.tensor(
-                    batch[kk],
-                    dtype=env.GLOBAL_PT_FLOAT_PRECISION,
-                    device=env.PREPROCESS_DEVICE,
-                )
+                batch[kk] = torch.tensor(batch[kk], dtype=env.GLOBAL_PT_FLOAT_PRECISION)
                 if self._data_dict[kk]["atomic"]:
                     batch[kk] = batch[kk].view(
                         n_frames, -1, self._data_dict[kk]["ndof"]
@@ -489,9 +485,7 @@ def preprocess(self, batch):
 
         for kk in ["type", "real_natoms_vec"]:
             if kk in batch.keys():
-                batch[kk] = torch.tensor(
-                    batch[kk], dtype=torch.long, device=env.PREPROCESS_DEVICE
-                )
+                batch[kk] = torch.tensor(batch[kk], dtype=torch.long)
         batch["atype"] = batch.pop("type")
 
         keys = ["nlist", "nlist_loc", "nlist_type", "shift", "mapping"]
@@ -524,13 +518,9 @@ def preprocess(self, batch):
         batch["nlist_type"] = nlist_type
         natoms_extended = max([item.shape[0] for item in shift])
         batch["shift"] = torch.zeros(
-            (n_frames, natoms_extended, 3),
-            dtype=env.GLOBAL_PT_FLOAT_PRECISION,
-            device=env.PREPROCESS_DEVICE,
-        )
-        batch["mapping"] = torch.zeros(
-            (n_frames, natoms_extended), dtype=torch.long, device=env.PREPROCESS_DEVICE
+            (n_frames, natoms_extended, 3), dtype=env.GLOBAL_PT_FLOAT_PRECISION
         )
+        batch["mapping"] = torch.zeros((n_frames, natoms_extended), dtype=torch.long)
         for i in range(len(shift)):
             natoms_tmp = shift[i].shape[0]
             batch["shift"][i, :natoms_tmp] = shift[i]
@@ -566,17 +556,13 @@ def single_preprocess(self, batch, sid):
                 pass
             else:
                 batch[kk] = torch.tensor(
-                    batch[kk][sid],
-                    dtype=env.GLOBAL_PT_FLOAT_PRECISION,
-                    device=env.PREPROCESS_DEVICE,
+                    batch[kk][sid], dtype=env.GLOBAL_PT_FLOAT_PRECISION
                 )
                 if self._data_dict[kk]["atomic"]:
                     batch[kk] = batch[kk].view(-1, self._data_dict[kk]["ndof"])
         for kk in ["type", "real_natoms_vec"]:
             if kk in batch.keys():
-                batch[kk] = torch.tensor(
-                    batch[kk][sid], dtype=torch.long, device=env.PREPROCESS_DEVICE
-                )
+                batch[kk] = torch.tensor(batch[kk][sid], dtype=torch.long)
         clean_coord = batch.pop("coord")
         clean_type = batch.pop("type")
         nloc = clean_type.shape[0]
@@ -670,30 +656,22 @@ def single_preprocess(self, batch, sid):
                         NotImplementedError(f"Unknown noise type {self.noise_type}!")
                     noised_coord = _clean_coord.clone().detach()
                     noised_coord[coord_mask] += noise_on_coord
-                    batch["coord_mask"] = torch.tensor(
-                        coord_mask, dtype=torch.bool, device=env.PREPROCESS_DEVICE
-                    )
+                    batch["coord_mask"] = torch.tensor(coord_mask, dtype=torch.bool)
                 else:
                     noised_coord = _clean_coord
                     batch["coord_mask"] = torch.tensor(
-                        np.zeros_like(coord_mask, dtype=bool),
-                        dtype=torch.bool,
-                        device=env.PREPROCESS_DEVICE,
+                        np.zeros_like(coord_mask, dtype=bool), dtype=torch.bool
                     )
 
                 # add mask for type
                 if self.mask_type:
                     masked_type = clean_type.clone().detach()
                     masked_type[type_mask] = self.mask_type_idx
-                    batch["type_mask"] = torch.tensor(
-                        type_mask, dtype=torch.bool, device=env.PREPROCESS_DEVICE
-                    )
+                    batch["type_mask"] = torch.tensor(type_mask, dtype=torch.bool)
                 else:
                     masked_type = clean_type
                     batch["type_mask"] = torch.tensor(
-                        np.zeros_like(type_mask, dtype=bool),
-                        dtype=torch.bool,
-                        device=env.PREPROCESS_DEVICE,
+                        np.zeros_like(type_mask, dtype=bool), dtype=torch.bool
                     )
                 if self.pbc:
                     _coord = normalize_coord(noised_coord, region, nloc)
@@ -803,7 +781,7 @@ def __len__(self):
     def __getitem__(self, index):
         """Get a frame from the selected system."""
         b_data = self._data_system._get_item(index)
-        b_data["natoms"] = torch.tensor(self._natoms_vec, device=env.PREPROCESS_DEVICE)
+        b_data["natoms"] = torch.tensor(self._natoms_vec)
         return b_data
 
 
@@ -878,9 +856,7 @@ def __getitem__(self, index=None):
         if index is None:
             index = dp_random.choice(np.arange(self.nsystems), p=self.probs)
         b_data = self._data_systems[index].get_batch(self._batch_size)
-        b_data["natoms"] = torch.tensor(
-            self._natoms_vec[index], device=env.PREPROCESS_DEVICE
-        )
+        b_data["natoms"] = torch.tensor(self._natoms_vec[index])
         batch_size = b_data["coord"].shape[0]
         b_data["natoms"] = b_data["natoms"].unsqueeze(0).expand(batch_size, -1)
         return b_data
@@ -891,9 +867,7 @@ def get_training_batch(self, index=None):
         if index is None:
             index = dp_random.choice(np.arange(self.nsystems), p=self.probs)
         b_data = self._data_systems[index].get_batch_for_train(self._batch_size)
-        b_data["natoms"] = torch.tensor(
-            self._natoms_vec[index], device=env.PREPROCESS_DEVICE
-        )
+        b_data["natoms"] = torch.tensor(self._natoms_vec[index])
         batch_size = b_data["coord"].shape[0]
         b_data["natoms"] = b_data["natoms"].unsqueeze(0).expand(batch_size, -1)
         return b_data
@@ -902,10 +876,7 @@ def get_batch(self, sys_idx=None):
         """TF-compatible batch for testing."""
         pt_batch = self[sys_idx]
         np_batch = {}
-        for key in ["coord", "box", "force", "energy", "virial"]:
-            if key in pt_batch.keys():
-                np_batch[key] = pt_batch[key].cpu().numpy()
-        for key in ["atype", "natoms"]:
+        for key in ["coord", "box", "force", "energy", "virial", "atype", "natoms"]:
             if key in pt_batch.keys():
                 np_batch[key] = pt_batch[key].cpu().numpy()
         batch_size = pt_batch["coord"].shape[0]
diff --git a/deepmd/pt/utils/env.py b/deepmd/pt/utils/env.py
index 6fa72943c7..559dba0167 100644
--- a/deepmd/pt/utils/env.py
+++ b/deepmd/pt/utils/env.py
@@ -24,11 +24,6 @@
 else:
     DEVICE = torch.device(f"cuda:{LOCAL_RANK}")
 
-if os.environ.get("PREPROCESS_DEVICE") == "gpu":
-    PREPROCESS_DEVICE = torch.device(f"cuda:{LOCAL_RANK}")
-else:
-    PREPROCESS_DEVICE = torch.device("cpu")
-
 JIT = False
 CACHE_PER_SYS = 5  # keep at most so many sets per sys in memory
 ENERGY_BIAS_TRAINABLE = True
diff --git a/deepmd/pt/utils/preprocess.py b/deepmd/pt/utils/preprocess.py
index 463ac112ad..18c798138e 100644
--- a/deepmd/pt/utils/preprocess.py
+++ b/deepmd/pt/utils/preprocess.py
@@ -99,7 +99,7 @@ def build_inside_clist(coord, region: Region3D, ncell):
     cell_offset[cell_offset < 0] = 0
     delta = cell_offset - ncell
     a2c = compute_serial_cid(cell_offset, ncell)  # cell id of atoms
-    arange = torch.arange(0, loc_ncell, 1, device=env.PREPROCESS_DEVICE)
+    arange = torch.arange(0, loc_ncell, 1)
     cellid = a2c == arange.unsqueeze(-1)  # one hot cellid
     c2a = cellid.nonzero()
     lst = []
@@ -131,18 +131,12 @@ def append_neighbors(coord, region: Region3D, atype, rcut: float):
 
     # add ghost atoms
     a2c, c2a = build_inside_clist(coord, region, ncell)
-    xi = torch.arange(-ngcell[0], ncell[0] + ngcell[0], 1, device=env.PREPROCESS_DEVICE)
-    yi = torch.arange(-ngcell[1], ncell[1] + ngcell[1], 1, device=env.PREPROCESS_DEVICE)
-    zi = torch.arange(-ngcell[2], ncell[2] + ngcell[2], 1, device=env.PREPROCESS_DEVICE)
-    xyz = xi.view(-1, 1, 1, 1) * torch.tensor(
-        [1, 0, 0], dtype=torch.long, device=env.PREPROCESS_DEVICE
-    )
-    xyz = xyz + yi.view(1, -1, 1, 1) * torch.tensor(
-        [0, 1, 0], dtype=torch.long, device=env.PREPROCESS_DEVICE
-    )
-    xyz = xyz + zi.view(1, 1, -1, 1) * torch.tensor(
-        [0, 0, 1], dtype=torch.long, device=env.PREPROCESS_DEVICE
-    )
+    xi = torch.arange(-ngcell[0], ncell[0] + ngcell[0], 1)
+    yi = torch.arange(-ngcell[1], ncell[1] + ngcell[1], 1)
+    zi = torch.arange(-ngcell[2], ncell[2] + ngcell[2], 1)
+    xyz = xi.view(-1, 1, 1, 1) * torch.tensor([1, 0, 0], dtype=torch.long)
+    xyz = xyz + yi.view(1, -1, 1, 1) * torch.tensor([0, 1, 0], dtype=torch.long)
+    xyz = xyz + zi.view(1, 1, -1, 1) * torch.tensor([0, 0, 1], dtype=torch.long)
     xyz = xyz.view(-1, 3)
     mask_a = (xyz >= 0).all(dim=-1)
     mask_b = (xyz < ncell).all(dim=-1)
@@ -165,9 +159,7 @@ def append_neighbors(coord, region: Region3D, atype, rcut: float):
     merged_coord = torch.cat([coord, tmp_coord])
     merged_coord_shift = torch.cat([torch.zeros_like(coord), coord_shift[tmp]])
     merged_atype = torch.cat([atype, tmp_atype])
-    merged_mapping = torch.cat(
-        [torch.arange(atype.numel(), device=env.PREPROCESS_DEVICE), aid]
-    )
+    merged_mapping = torch.cat([torch.arange(atype.numel()), aid])
     return merged_coord_shift, merged_atype, merged_mapping
 
 
@@ -188,22 +180,16 @@ def build_neighbor_list(
     distance = coord_l - coord_r
     distance = torch.linalg.norm(distance, dim=-1)
     DISTANCE_INF = distance.max().detach() + rcut
-    distance[:nloc, :nloc] += (
-        torch.eye(nloc, dtype=torch.bool, device=env.PREPROCESS_DEVICE) * DISTANCE_INF
-    )
+    distance[:nloc, :nloc] += torch.eye(nloc, dtype=torch.bool) * DISTANCE_INF
     if min_check:
         if distance.min().abs() < 1e-6:
             RuntimeError("Atom dist too close!")
     if not type_split:
         sec = sec[-1:]
     lst = []
-    nlist = torch.zeros((nloc, sec[-1].item()), device=env.PREPROCESS_DEVICE).long() - 1
-    nlist_loc = (
-        torch.zeros((nloc, sec[-1].item()), device=env.PREPROCESS_DEVICE).long() - 1
-    )
-    nlist_type = (
-        torch.zeros((nloc, sec[-1].item()), device=env.PREPROCESS_DEVICE).long() - 1
-    )
+    nlist = torch.zeros((nloc, sec[-1].item())).long() - 1
+    nlist_loc = torch.zeros((nloc, sec[-1].item())).long() - 1
+    nlist_type = torch.zeros((nloc, sec[-1].item())).long() - 1
     for i, nnei in enumerate(sec):
         if i > 0:
             nnei = nnei - sec[i - 1]
@@ -216,11 +202,8 @@ def build_neighbor_list(
             _sorted, indices = torch.topk(tmp, nnei, dim=1, largest=False)
         else:
             # when nnei > nall
-            indices = torch.zeros((nloc, nnei), device=env.PREPROCESS_DEVICE).long() - 1
-            _sorted = (
-                torch.ones((nloc, nnei), device=env.PREPROCESS_DEVICE).long()
-                * DISTANCE_INF
-            )
+            indices = torch.zeros((nloc, nnei)).long() - 1
+            _sorted = torch.ones((nloc, nnei)).long() * DISTANCE_INF
             _sorted_nnei, indices_nnei = torch.topk(
                 tmp, tmp.shape[1], dim=1, largest=False
             )
@@ -284,7 +267,7 @@ def make_env_mat(
     else:
         merged_coord_shift = torch.zeros_like(coord)
         merged_atype = atype.clone()
-        merged_mapping = torch.arange(atype.numel(), device=env.PREPROCESS_DEVICE)
+        merged_mapping = torch.arange(atype.numel())
         merged_coord = coord.clone()
 
     # build nlist
diff --git a/deepmd/pt/utils/stat.py b/deepmd/pt/utils/stat.py
index 18ee4d9abe..eec7179bcd 100644
--- a/deepmd/pt/utils/stat.py
+++ b/deepmd/pt/utils/stat.py
@@ -62,14 +62,9 @@ def make_stat_input(datasets, dataloaders, nbatches):
                         shape = torch.zeros(
                             (n_frames, extend, 3),
                             dtype=env.GLOBAL_PT_FLOAT_PRECISION,
-                            device=env.PREPROCESS_DEVICE,
                         )
                     else:
-                        shape = torch.zeros(
-                            (n_frames, extend),
-                            dtype=torch.long,
-                            device=env.PREPROCESS_DEVICE,
-                        )
+                        shape = torch.zeros((n_frames, extend), dtype=torch.long)
                     for i in range(len(item)):
                         natoms_tmp = l[i].shape[0]
                         shape[i, :natoms_tmp] = l[i]
diff --git a/source/tests/pt/test_descriptor.py b/source/tests/pt/test_descriptor.py
index da38cf007f..2dd996349b 100644
--- a/source/tests/pt/test_descriptor.py
+++ b/source/tests/pt/test_descriptor.py
@@ -18,6 +18,7 @@
 )
 from deepmd.pt.utils import (
     dp_random,
+    env,
 )
 from deepmd.pt.utils.dataset import (
     DeepmdDataSet,
@@ -112,29 +113,33 @@ def setUp(self):
 
     def test_consistency(self):
         avg_zero = torch.zeros(
-            [self.ntypes, self.nnei * 4], dtype=GLOBAL_PT_FLOAT_PRECISION
+            [self.ntypes, self.nnei * 4],
+            dtype=GLOBAL_PT_FLOAT_PRECISION,
+            device=env.DEVICE,
         )
         std_ones = torch.ones(
-            [self.ntypes, self.nnei * 4], dtype=GLOBAL_PT_FLOAT_PRECISION
+            [self.ntypes, self.nnei * 4],
+            dtype=GLOBAL_PT_FLOAT_PRECISION,
+            device=env.DEVICE,
         )
         base_d, base_force, nlist = base_se_a(
             rcut=self.rcut,
             rcut_smth=self.rcut_smth,
             sel=self.sel,
             batch=self.np_batch,
-            mean=avg_zero,
-            stddev=std_ones,
+            mean=avg_zero.detach().cpu(),
+            stddev=std_ones.detach().cpu(),
         )
 
-        pt_coord = self.pt_batch["coord"]
+        pt_coord = self.pt_batch["coord"].to(env.DEVICE)
         pt_coord.requires_grad_(True)
-        index = self.pt_batch["mapping"].unsqueeze(-1).expand(-1, -1, 3)
+        index = self.pt_batch["mapping"].unsqueeze(-1).expand(-1, -1, 3).to(env.DEVICE)
         extended_coord = torch.gather(pt_coord, dim=1, index=index)
-        extended_coord = extended_coord - self.pt_batch["shift"]
+        extended_coord = extended_coord - self.pt_batch["shift"].to(env.DEVICE)
         my_d, _, _ = prod_env_mat_se_a(
             extended_coord.to(DEVICE),
-            self.pt_batch["nlist"],
-            self.pt_batch["atype"],
+            self.pt_batch["nlist"].to(env.DEVICE),
+            self.pt_batch["atype"].to(env.DEVICE),
             avg_zero.reshape([-1, self.nnei, 4]).to(DEVICE),
             std_ones.reshape([-1, self.nnei, 4]).to(DEVICE),
             self.rcut,
diff --git a/source/tests/pt/test_descriptor_dpa1.py b/source/tests/pt/test_descriptor_dpa1.py
index 689fa7e49c..725369d68d 100644
--- a/source/tests/pt/test_descriptor_dpa1.py
+++ b/source/tests/pt/test_descriptor_dpa1.py
@@ -243,7 +243,7 @@ def test_descriptor_block(self):
         dparams["ntypes"] = ntypes
         des = DescrptBlockSeAtten(
             **dparams,
-        )
+        ).to(env.DEVICE)
         des.load_state_dict(torch.load(self.file_model_param))
         rcut = dparams["rcut"]
         nsel = dparams["sel"]
@@ -260,7 +260,7 @@ def test_descriptor_block(self):
             extended_coord, extended_atype, nloc, rcut, nsel, distinguish_types=False
         )
         # handel type_embedding
-        type_embedding = TypeEmbedNet(ntypes, 8)
+        type_embedding = TypeEmbedNet(ntypes, 8).to(env.DEVICE)
         type_embedding.load_state_dict(torch.load(self.file_type_embed))
 
         ## to save model parameters
@@ -293,7 +293,7 @@ def test_descriptor(self):
         dparams["concat_output_tebd"] = False
         des = DescrptDPA1(
             **dparams,
-        )
+        ).to(env.DEVICE)
         target_dict = des.state_dict()
         source_dict = torch.load(self.file_model_param)
         type_embd_dict = torch.load(self.file_type_embed)
@@ -337,7 +337,7 @@ def test_descriptor(self):
         dparams["concat_output_tebd"] = True
         des = DescrptDPA1(
             **dparams,
-        )
+        ).to(env.DEVICE)
         descriptor, env_mat, diff, rot_mat, sw = des(
             extended_coord,
             extended_atype,
diff --git a/source/tests/pt/test_descriptor_dpa2.py b/source/tests/pt/test_descriptor_dpa2.py
index 45c95961fe..aa6b16964e 100644
--- a/source/tests/pt/test_descriptor_dpa2.py
+++ b/source/tests/pt/test_descriptor_dpa2.py
@@ -124,7 +124,7 @@ def test_descriptor_hyb(self):
             dlist,
             ntypes,
             hybrid_mode=dparams["hybrid_mode"],
-        )
+        ).to(env.DEVICE)
         model_dict = torch.load(self.file_model_param)
         # type_embd of repformer is removed
         model_dict.pop("descriptor_list.1.type_embd.embedding.weight")
@@ -158,7 +158,7 @@ def test_descriptor_hyb(self):
             )
         nlist = torch.cat(nlist_list, -1)
         # handel type_embedding
-        type_embedding = TypeEmbedNet(ntypes, 8)
+        type_embedding = TypeEmbedNet(ntypes, 8).to(env.DEVICE)
         type_embedding.load_state_dict(torch.load(self.file_type_embed))
 
         ## to save model parameters
@@ -186,7 +186,7 @@ def test_descriptor(self):
         dparams["concat_output_tebd"] = False
         des = DescrptDPA2(
             **dparams,
-        )
+        ).to(env.DEVICE)
         target_dict = des.state_dict()
         source_dict = torch.load(self.file_model_param)
         # type_embd of repformer is removed
@@ -232,7 +232,7 @@ def test_descriptor(self):
         dparams["concat_output_tebd"] = True
         des = DescrptDPA2(
             **dparams,
-        )
+        ).to(env.DEVICE)
         descriptor, env_mat, diff, rot_mat, sw = des(
             extended_coord,
             extended_atype,
diff --git a/source/tests/pt/test_embedding_net.py b/source/tests/pt/test_embedding_net.py
index fc98ddc9f9..407f4949b5 100644
--- a/source/tests/pt/test_embedding_net.py
+++ b/source/tests/pt/test_embedding_net.py
@@ -8,6 +8,10 @@
 import tensorflow.compat.v1 as tf
 import torch
 
+from deepmd.pt.utils import (
+    env,
+)
+
 tf.disable_eager_execution()
 
 from pathlib import (
@@ -148,18 +152,22 @@ def test_consistency(self):
                     # Keep parameter value consistency between 2 implentations
                     param.data.copy_(torch.from_numpy(var))
 
-        pt_coord = self.torch_batch["coord"]
+        pt_coord = self.torch_batch["coord"].to(env.DEVICE)
         pt_coord.requires_grad_(True)
-        index = self.torch_batch["mapping"].unsqueeze(-1).expand(-1, -1, 3)
+        index = (
+            self.torch_batch["mapping"].unsqueeze(-1).expand(-1, -1, 3).to(env.DEVICE)
+        )
         extended_coord = torch.gather(pt_coord, dim=1, index=index)
-        extended_coord = extended_coord - self.torch_batch["shift"]
+        extended_coord = extended_coord - self.torch_batch["shift"].to(env.DEVICE)
         extended_atype = torch.gather(
-            self.torch_batch["atype"], dim=1, index=self.torch_batch["mapping"]
+            self.torch_batch["atype"].to(env.DEVICE),
+            dim=1,
+            index=self.torch_batch["mapping"].to(env.DEVICE),
         )
         descriptor_out, _, _, _, _ = descriptor(
             extended_coord,
             extended_atype,
-            self.torch_batch["nlist"],
+            self.torch_batch["nlist"].to(env.DEVICE),
         )
         my_embedding = descriptor_out.cpu().detach().numpy()
         fake_energy = torch.sum(descriptor_out)
diff --git a/source/tests/pt/test_fitting_net.py b/source/tests/pt/test_fitting_net.py
index ed2c428de5..e12a397347 100644
--- a/source/tests/pt/test_fitting_net.py
+++ b/source/tests/pt/test_fitting_net.py
@@ -11,6 +11,9 @@
 from deepmd.pt.model.task import (
     EnergyFittingNet,
 )
+from deepmd.pt.utils import (
+    env,
+)
 from deepmd.pt.utils.env import (
     GLOBAL_NP_FLOAT_PRECISION,
 )
@@ -105,7 +108,7 @@ def test_consistency(self):
             neuron=self.n_neuron,
             bias_atom_e=self.dp_fn.bias_atom_e,
             distinguish_types=True,
-        )
+        ).to(env.DEVICE)
         for name, param in my_fn.named_parameters():
             matched = re.match(
                 "filter_layers\.networks\.(\d).layers\.(\d)\.([a-z]+)", name
@@ -129,9 +132,9 @@ def test_consistency(self):
         embedding = torch.from_numpy(self.embedding)
         embedding = embedding.view(4, -1, self.embedding_width)
         atype = torch.from_numpy(self.atype)
-        ret = my_fn(embedding, atype)
+        ret = my_fn(embedding.to(env.DEVICE), atype.to(env.DEVICE))
         my_energy = ret["energy"]
-        my_energy = my_energy.detach()
+        my_energy = my_energy.detach().cpu()
         np.testing.assert_allclose(dp_energy, my_energy.numpy().reshape([-1]))
 
 
diff --git a/source/tests/pt/test_mlp.py b/source/tests/pt/test_mlp.py
index c06047b2a5..26f0041bf9 100644
--- a/source/tests/pt/test_mlp.py
+++ b/source/tests/pt/test_mlp.py
@@ -5,6 +5,9 @@
 import numpy as np
 import torch
 
+from deepmd.pt.utils import (
+    env,
+)
 from deepmd.pt.utils.env import (
     PRECISION_DICT,
 )
@@ -104,23 +107,27 @@ def test_match_native_layer(
                 inp_shap = ashp + inp_shap
             rtol, atol = get_tols(prec)
             dtype = PRECISION_DICT[prec]
-            xx = torch.arange(np.prod(inp_shap), dtype=dtype).view(inp_shap)
+            xx = torch.arange(np.prod(inp_shap), dtype=dtype, device=env.DEVICE).view(
+                inp_shap
+            )
             # def mlp layer
-            ml = MLPLayer(ninp, nout, bias, ut, ac, resnet, precision=prec)
+            ml = MLPLayer(ninp, nout, bias, ut, ac, resnet, precision=prec).to(
+                env.DEVICE
+            )
             # check consistency
             nl = NativeLayer.deserialize(ml.serialize())
             np.testing.assert_allclose(
-                ml.forward(xx).detach().numpy(),
-                nl.call(xx.detach().numpy()),
+                ml.forward(xx).detach().cpu().numpy(),
+                nl.call(xx.detach().cpu().numpy()),
                 rtol=rtol,
                 atol=atol,
                 err_msg=f"(i={ninp}, o={nout}) bias={bias} use_dt={ut} act={ac} resnet={resnet} prec={prec}",
             )
             # check self-consistency
-            ml1 = MLPLayer.deserialize(ml.serialize())
+            ml1 = MLPLayer.deserialize(ml.serialize()).to(env.DEVICE)
             np.testing.assert_allclose(
-                ml.forward(xx).detach().numpy(),
-                ml1.forward(xx).detach().numpy(),
+                ml.forward(xx).detach().cpu().numpy(),
+                ml1.forward(xx).detach().cpu().numpy(),
                 rtol=rtol,
                 atol=atol,
                 err_msg=f"(i={ninp}, o={nout}) bias={bias} use_dt={ut} act={ac} resnet={resnet} prec={prec}",
@@ -157,7 +164,9 @@ def test_match_native_net(
                 inp_shap = ashp + inp_shap
             rtol, atol = get_tols(prec)
             dtype = PRECISION_DICT[prec]
-            xx = torch.arange(np.prod(inp_shap), dtype=dtype).view(inp_shap)
+            xx = torch.arange(np.prod(inp_shap), dtype=dtype, device=env.DEVICE).view(
+                inp_shap
+            )
             # def MLP
             layers = []
             for ii in range(1, len(ndims)):
@@ -166,21 +175,21 @@ def test_match_native_net(
                         ndims[ii - 1], ndims[ii], bias, ut, ac, resnet, precision=prec
                     ).serialize()
                 )
-            ml = MLP(layers)
+            ml = MLP(layers).to(env.DEVICE)
             # check consistency
             nl = NativeNet.deserialize(ml.serialize())
             np.testing.assert_allclose(
-                ml.forward(xx).detach().numpy(),
-                nl.call(xx.detach().numpy()),
+                ml.forward(xx).detach().cpu().numpy(),
+                nl.call(xx.detach().cpu().numpy()),
                 rtol=rtol,
                 atol=atol,
                 err_msg=f"net={ndims} bias={bias} use_dt={ut} act={ac} resnet={resnet} prec={prec}",
             )
             # check self-consistency
-            ml1 = MLP.deserialize(ml.serialize())
+            ml1 = MLP.deserialize(ml.serialize()).to(env.DEVICE)
             np.testing.assert_allclose(
-                ml.forward(xx).detach().numpy(),
-                ml1.forward(xx).detach().numpy(),
+                ml.forward(xx).detach().cpu().numpy(),
+                ml1.forward(xx).detach().cpu().numpy(),
                 rtol=rtol,
                 atol=atol,
                 err_msg=f"net={ndims} bias={bias} use_dt={ut} act={ac} resnet={resnet} prec={prec}",
@@ -219,23 +228,23 @@ def test_match_embedding_net(
             # input
             rtol, atol = get_tols(prec)
             dtype = PRECISION_DICT[prec]
-            xx = torch.arange(idim, dtype=dtype)
+            xx = torch.arange(idim, dtype=dtype, device=env.DEVICE)
             # def MLP
-            ml = EmbeddingNet(idim, nn, act, idt, prec)
+            ml = EmbeddingNet(idim, nn, act, idt, prec).to(env.DEVICE)
             # check consistency
             nl = DPEmbeddingNet.deserialize(ml.serialize())
             np.testing.assert_allclose(
-                ml.forward(xx).detach().numpy(),
-                nl.call(xx.detach().numpy()),
+                ml.forward(xx).detach().cpu().numpy(),
+                nl.call(xx.detach().cpu().numpy()),
                 rtol=rtol,
                 atol=atol,
                 err_msg=f"idim={idim} nn={nn} use_dt={idt} act={act} prec={prec}",
             )
             # check self-consistency
-            ml1 = EmbeddingNet.deserialize(ml.serialize())
+            ml1 = EmbeddingNet.deserialize(ml.serialize()).to(env.DEVICE)
             np.testing.assert_allclose(
-                ml.forward(xx).detach().numpy(),
-                ml1.forward(xx).detach().numpy(),
+                ml.forward(xx).detach().cpu().numpy(),
+                ml1.forward(xx).detach().cpu().numpy(),
                 rtol=rtol,
                 atol=atol,
                 err_msg=f"idim={idim} nn={nn} use_dt={idt} act={act} prec={prec}",
@@ -246,8 +255,8 @@ def test_jit(
     ):
         for idim, nn, act, idt, prec in self.test_cases:
             # def MLP
-            ml = EmbeddingNet(idim, nn, act, idt, prec)
-            ml1 = EmbeddingNet.deserialize(ml.serialize())
+            ml = EmbeddingNet(idim, nn, act, idt, prec).to(env.DEVICE)
+            ml1 = EmbeddingNet.deserialize(ml.serialize()).to(env.DEVICE)
             model = torch.jit.script(ml)
             model = torch.jit.script(ml1)
 
@@ -272,7 +281,7 @@ def test_match_fitting_net(
             # input
             rtol, atol = get_tols(prec)
             dtype = PRECISION_DICT[prec]
-            xx = torch.arange(idim, dtype=dtype)
+            xx = torch.arange(idim, dtype=dtype, device=env.DEVICE)
             # def MLP
             ml = FittingNet(
                 idim,
@@ -282,21 +291,21 @@ def test_match_fitting_net(
                 resnet_dt=idt,
                 precision=prec,
                 bias_out=ob,
-            )
+            ).to(env.DEVICE)
             # check consistency
             nl = DPFittingNet.deserialize(ml.serialize())
             np.testing.assert_allclose(
-                ml.forward(xx).detach().numpy(),
-                nl.call(xx.detach().numpy()),
+                ml.forward(xx).detach().cpu().numpy(),
+                nl.call(xx.detach().cpu().numpy()),
                 rtol=rtol,
                 atol=atol,
                 err_msg=f"idim={idim} nn={nn} use_dt={idt} act={act} prec={prec}",
             )
             # check self-consistency
-            ml1 = FittingNet.deserialize(ml.serialize())
+            ml1 = FittingNet.deserialize(ml.serialize()).to(env.DEVICE)
             np.testing.assert_allclose(
-                ml.forward(xx).detach().numpy(),
-                ml1.forward(xx).detach().numpy(),
+                ml.forward(xx).detach().cpu().numpy(),
+                ml1.forward(xx).detach().cpu().numpy(),
                 rtol=rtol,
                 atol=atol,
                 err_msg=f"idim={idim} nn={nn} use_dt={idt} act={act} prec={prec}",
@@ -315,7 +324,7 @@ def test_jit(
                 resnet_dt=idt,
                 precision=prec,
                 bias_out=ob,
-            )
-            ml1 = FittingNet.deserialize(ml.serialize())
+            ).to(env.DEVICE)
+            ml1 = FittingNet.deserialize(ml.serialize()).to(env.DEVICE)
             model = torch.jit.script(ml)
             model = torch.jit.script(ml1)
diff --git a/source/tests/pt/test_model.py b/source/tests/pt/test_model.py
index c6595e6471..e87a53969c 100644
--- a/source/tests/pt/test_model.py
+++ b/source/tests/pt/test_model.py
@@ -7,6 +7,10 @@
 import tensorflow.compat.v1 as tf
 import torch
 
+from deepmd.pt.utils import (
+    env,
+)
+
 tf.disable_eager_execution()
 
 from pathlib import (
@@ -340,10 +344,16 @@ def test_consistency(self):
             batch["natoms_vec"], device=batch["coord"].device
         ).unsqueeze(0)
         model_predict = my_model(
-            batch["coord"], batch["atype"], batch["box"], do_atomic_virial=True
+            batch["coord"].to(env.DEVICE),
+            batch["atype"].to(env.DEVICE),
+            batch["box"].to(env.DEVICE),
+            do_atomic_virial=True,
         )
         model_predict_1 = my_model(
-            batch["coord"], batch["atype"], batch["box"], do_atomic_virial=False
+            batch["coord"].to(env.DEVICE),
+            batch["atype"].to(env.DEVICE),
+            batch["box"].to(env.DEVICE),
+            do_atomic_virial=False,
         )
         p_energy, p_force, p_virial, p_atomic_virial = (
             model_predict["energy"],
@@ -357,8 +367,8 @@ def test_consistency(self):
             "force": p_force,
         }
         label = {
-            "energy": batch["energy"],
-            "force": batch["force"],
+            "energy": batch["energy"].to(env.DEVICE),
+            "force": batch["force"].to(env.DEVICE),
         }
         loss, _ = my_loss(model_pred, label, int(batch["natoms"][0, 0]), cur_lr)
         np.testing.assert_allclose(
diff --git a/source/tests/pt/test_saveload_dpa1.py b/source/tests/pt/test_saveload_dpa1.py
index d1043f7029..1b4c41a204 100644
--- a/source/tests/pt/test_saveload_dpa1.py
+++ b/source/tests/pt/test_saveload_dpa1.py
@@ -129,13 +129,13 @@ def get_data(self):
         input_dict = {}
         for item in ["coord", "atype", "box"]:
             if item in batch_data:
-                input_dict[item] = batch_data[item]
+                input_dict[item] = batch_data[item].to(env.DEVICE)
             else:
                 input_dict[item] = None
         label_dict = {}
         for item in ["energy", "force", "virial"]:
             if item in batch_data:
-                label_dict[item] = batch_data[item]
+                label_dict[item] = batch_data[item].to(env.DEVICE)
         return input_dict, label_dict
 
     def test_saveload(self):
diff --git a/source/tests/pt/test_saveload_se_e2_a.py b/source/tests/pt/test_saveload_se_e2_a.py
index 95d7f97a88..7f8364a16f 100644
--- a/source/tests/pt/test_saveload_se_e2_a.py
+++ b/source/tests/pt/test_saveload_se_e2_a.py
@@ -123,13 +123,13 @@ def get_data(self):
         input_dict = {}
         for item in ["coord", "atype", "box"]:
             if item in batch_data:
-                input_dict[item] = batch_data[item]
+                input_dict[item] = batch_data[item].to(env.DEVICE)
             else:
                 input_dict[item] = None
         label_dict = {}
         for item in ["energy", "force", "virial"]:
             if item in batch_data:
-                label_dict[item] = batch_data[item]
+                label_dict[item] = batch_data[item].to(env.DEVICE)
         return input_dict, label_dict
 
     def test_saveload(self):