Device inconsistency issue #4

Open
abandol opened this issue May 29, 2024 · 2 comments

Comments

@abandol

abandol commented May 29, 2024

Hello, the MLP still has the problem of variables not being on the same device.

@Dragon1573

@GDUT-ZJJ It looks like this problem still exists. I never noticed this error when testing it on Baidu Tieba (百度贴吧) earlier 🤣


Error traceback
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[20], line 11
      9 MAE, RMSE, RE = [], [], []
     10 for seed in range(10):
---> 11     re_list, mae_list, rmse_list, _ = tain(
     12         LR, feature_size, hidden_size, weight_decay, window_size, EPOCH, seed
     13     )
     14     RE.append(np.mean(np.array(re_list)))
     15     MAE.append(np.mean(np.array(mae_list)))

Cell In[19], line 38
     35 y = np.reshape(train_y[:, -1] / Rated_Capacity, (-1, 1)).astype(np.float32)
     37 X, y = torch.from_numpy(X).to(device), torch.from_numpy(y).to(device)
---> 38 output = model(X)
     39 loss = criterion(output, y)
     40 optimizer.zero_grad()  # clear gradients for this training step

File d:\Repository\NASA\.venv\lib\site-packages\torch\nn\modules\module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
   1734     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1735 else:
-> 1736     return self._call_impl(*args, **kwargs)

File d:\Repository\NASA\.venv\lib\site-packages\torch\nn\modules\module.py:1747, in Module._call_impl(self, *args, **kwargs)
   1742 # If we don't have any hooks, we want to skip the rest of the logic in
   1743 # this function, and just call forward.
   1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1745         or _global_backward_pre_hooks or _global_backward_hooks
   1746         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747     return forward_call(*args, **kwargs)
   1749 result = None
   1750 called_always_called_hooks = set()

Cell In[18], line 17
     15 out = self.layer0(x)
     16 for layer in self.layers:
---> 17     out = layer(out)
     18 out = self.linear(out)
     19 return out

File d:\Repository\NASA\.venv\lib\site-packages\torch\nn\modules\module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
   1734     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1735 else:
-> 1736     return self._call_impl(*args, **kwargs)

File d:\Repository\NASA\.venv\lib\site-packages\torch\nn\modules\module.py:1747, in Module._call_impl(self, *args, **kwargs)
   1742 # If we don't have any hooks, we want to skip the rest of the logic in
   1743 # this function, and just call forward.
   1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1745         or _global_backward_pre_hooks or _global_backward_hooks
   1746         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747     return forward_call(*args, **kwargs)
   1749 result = None
   1750 called_always_called_hooks = set()

File d:\Repository\NASA\.venv\lib\site-packages\torch\nn\modules\container.py:250, in Sequential.forward(self, input)
    248 def forward(self, input):
    249     for module in self:
--> 250         input = module(input)
    251     return input

File d:\Repository\NASA\.venv\lib\site-packages\torch\nn\modules\module.py:1736, in Module._wrapped_call_impl(self, *args, **kwargs)
   1734     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1735 else:
-> 1736     return self._call_impl(*args, **kwargs)

File d:\Repository\NASA\.venv\lib\site-packages\torch\nn\modules\module.py:1747, in Module._call_impl(self, *args, **kwargs)
   1742 # If we don't have any hooks, we want to skip the rest of the logic in
   1743 # this function, and just call forward.
   1744 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1745         or _global_backward_pre_hooks or _global_backward_hooks
   1746         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747     return forward_call(*args, **kwargs)
   1749 result = None
   1750 called_always_called_hooks = set()

File d:\Repository\NASA\.venv\lib\site-packages\torch\nn\modules\linear.py:125, in Linear.forward(self, input)
    124 def forward(self, input: Tensor) -> Tensor:
--> 125     return F.linear(input, self.weight, self.bias)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)
Runtime environment
21:57:18 D:\...\NASA  [main ≡ +5 ~1 -0 !] 0ms pwsh> uv run python -c 'from sys import version; print(version)'
3.9.20 (main, Oct 16 2024, 00:31:33) [MSC v.1929 64 bit (AMD64)]

21:57:24 D:\...\NASA  [main ≡ +5 ~1 -0 !] 51ms pwsh> uv tree -d 1
Resolved 79 packages in 0.67ms
nasa v0.1.0
├── ipykernel v6.29.5
├── matplotlib v3.9.2
├── scikit-learn v1.5.2
├── scipy v1.13.1
├── torch v2.5.1+cu124
├── torchaudio v2.5.1+cu124
├── torchvision v0.20.1+cu124
├── mypy v1.13.0 (group: dev)
└── ruff v0.7.4 (group: dev)

21:58:20 D:\...\NASA  [main ≡ +5 ~1 -0 !] 0ms pwsh> systeminfo.exe

OS Name:                   Microsoft Windows 11 专业工作站版
OS Version:                10.0.22631 N/A Build 22631
OS Manufacturer:           Microsoft Corporation
OS Configuration:          Standalone Workstation
OS Build Type:             Multiprocessor Free
System Manufacturer:       ASUS
System Model:              System Product Name
System Type:               x64-based PC

22:00:19 D:\...\NASA  [main ≡ +5 ~1 -0 !] 0ms pwsh> nvidia-smi.exe
Thu Nov 21 22:00:21 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.03                 Driver Version: 566.03         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 2070      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   55C    P0             29W /  131W |    1286MiB /   8192MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

22:00:34 D:\...\NASA  [main ≡ +5 ~1 -0 !] 0ms pwsh> nvcc.exe --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Sep_12_02:55:00_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.6, V12.6.77
Build cuda_12.6.r12.6/compiler.34841621_0

# cuDNN v9.5 is installed
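
To make the root cause visible, here is a small diagnostic sketch (added here, not from the notebook) that prints the device of every parameter PyTorch has registered on the model. Hidden layers kept in a plain Python list are not registered as submodules, so they never show up in this listing, are not moved by .to(device), and presumably stay on the CPU while the input sits on cuda:0.

# Diagnostic sketch (not part of the notebook): show which device each
# registered parameter lives on.
def report_devices(model):
    for name, param in model.named_parameters():
        print(f"{name:40s} -> {param.device}")

# Example: report_devices(Net(feature_size=8, hidden_size=[16, 8]).to(device))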

@Dragon1573

🎉 Solution found

@abandol @GDUT-ZJJ @XiuzeZhou 👋🏼

Caution

Before each run of MLP.ipynb, please clear all cell outputs and restart the kernel, so that your cell numbering matches the description below.

The following parts need to be modified:

# This is the 8th Python cell in MLP.ipynb,
# numbered In [8]

class Net(nn.Module):
    def __init__(self, feature_size=8, hidden_size=[16, 8]):
        super(Net, self).__init__()
        self.feature_size, self.hidden_size = feature_size, hidden_size
        # The main changes start below: every PyTorch Tensor or Module must be moved explicitly to the target device,
        # otherwise it stays on the CPU by default. Presumably @XiuzeZhou never tested on a CUDA device.
        # (See the alternative nn.ModuleList sketch after this cell.)
        self.layer0 = nn.Linear(self.feature_size, self.hidden_size[0]).to(device)
        self.layers = [
            nn.Sequential(
                nn.Linear(self.hidden_size[i], self.hidden_size[i + 1]).to(device),
                nn.ReLU().to(device),
            ).to(device)
            for i in range(len(self.hidden_size) - 1)
        ]
        self.linear = nn.Linear(self.hidden_size[-1], 1).to(device)
        # End of the core changes

    def forward(self, x):
        out = self.layer0(x)
        for layer in self.layers:
            out = layer(out)
        out = self.linear(out)
        return out
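
A note on the fix above (a suggestion of mine, not part of the original notebook): because self.layers is a plain Python list, PyTorch does not register its elements as submodules, so model.parameters() does not include their weights and a single model.to(device) would not move them, which is why every element needs its own .to(device) call. Below is a sketch of an alternative that stores the hidden layers in nn.ModuleList, so one .to(device) on the model is enough and the optimizer sees all parameters.

# Alternative sketch (assumptions: same constructor arguments and the same
# global `device` variable as the notebook). nn.ModuleList registers the hidden
# layers as submodules, so model.to(device) moves them and model.parameters()
# exposes their weights.
import torch
from torch import nn


class NetWithModuleList(nn.Module):
    def __init__(self, feature_size=8, hidden_size=[16, 8]):
        super().__init__()
        self.layer0 = nn.Linear(feature_size, hidden_size[0])
        self.layers = nn.ModuleList(
            nn.Sequential(nn.Linear(hidden_size[i], hidden_size[i + 1]), nn.ReLU())
            for i in range(len(hidden_size) - 1)
        )
        self.linear = nn.Linear(hidden_size[-1], 1)

    def forward(self, x):
        out = self.layer0(x)
        for layer in self.layers:
            out = layer(out)
        return self.linear(out)


# Usage sketch: model = NetWithModuleList(feature_size, hidden_size).to(device)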
# This is the 9th Python cell in MLP.ipynb,
# numbered In [9]

def tain(
    LR=0.01,
    feature_size=8,
    hidden_size=[16, 8],
    weight_decay=0.0,
    window_size=8,
    EPOCH=1000,
    seed=0,
):
    mae_list, rmse_list, re_list = [], [], []
    result_list = []
    for i in range(4):
        name = Battery_list[i]
        train_x, train_y, train_data, test_data = get_train_test(
            Battery, name, window_size
        )
        train_size = len(train_x)
        print("sample size: {}".format(train_size))

        setup_seed(seed)
        # Core changes start here
        model = Net(feature_size=feature_size, hidden_size=hidden_size).to(device)

        optimizer = torch.optim.Adam(
            model.parameters(), lr=LR, weight_decay=weight_decay
        )
        criterion = nn.MSELoss().to(device)
        # Core changes end here

        test_x = train_data.copy()
        loss_list, y_ = [0], []
        for epoch in range(EPOCH):
            X = np.reshape(train_x / Rated_Capacity, (-1, feature_size)).astype(
                np.float32
            )
            y = np.reshape(train_y[:, -1] / Rated_Capacity, (-1, 1)).astype(np.float32)

            X, y = torch.from_numpy(X).to(device), torch.from_numpy(y).to(device)
            output = model(X)
            loss = criterion(output, y)
            optimizer.zero_grad()  # clear gradients for this training step
            loss.backward()  # backpropagation, compute gradients
            optimizer.step()  # apply gradients

            if (epoch + 1) % 100 == 0:
                test_x = train_data.copy()  # re-run the prediction every 100 epochs
                point_list = []
                while (len(test_x) - len(train_data)) < len(test_data):
                    x = np.reshape(
                        np.array(test_x[-feature_size:]) / Rated_Capacity,
                        (-1, feature_size),
                    ).astype(np.float32)
                    x = torch.from_numpy(x).to(device)
                    pred = model(x)  # prediction on the test set; pred has shape (batch_size=1, feature_size=1)
                    # Core changes start here
                    # model and x both live on the target device (the GPU), so the prediction has to be
                    # brought back to the CPU before .data and .numpy() can turn it into a NumPy array
                    next_point = pred.cpu().data.numpy()[0, 0] * Rated_Capacity
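                    # Note (added here, not in the original notebook): pred.detach().cpu().numpy()[0, 0],
                    # or pred.detach().cpu().item() for a single value, is the more idiomatic modern
                    # equivalent of pred.cpu().data.numpy()[0, 0].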
                    # Core changes end here
                    test_x.append(next_point)  # append the prediction to the sequence so it can be used to predict the next point
                    point_list.append(next_point)  # save the prediction for the last point of the output sequence
                y_.append(point_list)  # save all predictions from this pass
                loss_list.append(loss)
                mae, rmse = evaluation(y_test=test_data, y_predict=y_[-1])
                re = relative_error(
                    y_test=test_data, y_predict=y_[-1], threshold=Rated_Capacity * 0.7
                )
                print(
                    "epoch:{:<2d} | loss:{:<6.4f} | MAE:{:<6.4f} | RMSE:{:<6.4f} | RE:{:<6.4f}".format(
                        epoch, loss, mae, rmse, re
                    )
                )
            if (len(loss_list) > 1) and (abs(loss_list[-2] - loss_list[-1]) < 1e-5):
                break

        mae, rmse = evaluation(y_test=test_data, y_predict=y_[-1])
        re = relative_error(
            y_test=test_data, y_predict=y_[-1], threshold=Rated_Capacity * 0.7
        )
        mae_list.append(mae)
        rmse_list.append(rmse)
        re_list.append(re)
        result_list.append(y_[-1])
    return re_list, mae_list, rmse_list, result_list
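
For completeness, the driver cell from the traceback (Cell In[20]) needs no changes of its own; only the two cells above do. A sketch of that cell follows, where the concrete hyperparameter values are assumptions for illustration; use the ones defined earlier in the notebook.

# Driver cell sketch (Cell In[20] in the traceback). Hyperparameter values
# below are assumptions, not taken from the notebook.
LR, weight_decay, EPOCH = 0.01, 0.0, 1000
feature_size = window_size = 8
hidden_size = [16, 8]

MAE, RMSE, RE = [], [], []
for seed in range(10):
    re_list, mae_list, rmse_list, _ = tain(
        LR, feature_size, hidden_size, weight_decay, window_size, EPOCH, seed
    )
    RE.append(np.mean(np.array(re_list)))
    MAE.append(np.mean(np.array(mae_list)))
    RMSE.append(np.mean(np.array(rmse_list)))

print("RE: {:.4f} | MAE: {:.4f} | RMSE: {:.4f}".format(np.mean(RE), np.mean(MAE), np.mean(RMSE)))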
