diff --git a/.github/workflows/build-dev.yml b/.github/workflows/build-dev.yml
index 10b34f5c1eb..84d63d7331f 100644
--- a/.github/workflows/build-dev.yml
+++ b/.github/workflows/build-dev.yml
@@ -45,7 +45,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v2
         with:
-          python-version: 3.9
+          python-version: 3.7
       - name: Install Python dependencies
         run: |
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 2c5f00fdbb2..628b5e04efd 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.9.18
+          python-version: 3.7.17
       - name: Cache Python packages
         id: cache-python
@@ -78,7 +78,7 @@
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.9.18
+          python-version: 3.7.17
       - name: Get cached Python packages
         id: cache-python
diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml
index 8a4be675736..88b24303c39 100644
--- a/.github/workflows/deploy-dev.yml
+++ b/.github/workflows/deploy-dev.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
-          python-version: 3.9
+          python-version: 3.7
      - name: Install Python dependencies
        run: |
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 1a7dcbb9a48..c85fa43dd88 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.9.18
+          python-version: 3.7.17
       - name: Cache Python packages
         id: cache-python
@@ -68,7 +68,7 @@
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.9.18
+          python-version: 3.7.17
       - name: Get cached Python packages
         id: cache-python
diff --git a/requirements.txt b/requirements.txt
index 9e3259cca2a..e8c44ca929e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Cache key for GitHub Actions: v1.13.2
-sphinx==5.0.2
+sphinx==4.3.2
 sphinx-autobuild
 sphinx-intl
 sphinxcontrib-mermaid==0.7.1
@@ -10,10 +10,12 @@
 sphinx-tabs
 sphinx-togglebutton
 sphinx-panels
 sphinx-remove-toctrees
-pydata-sphinx-theme==0.14.4
 jieba
 ipython
 recommonmark
 nbsphinx
 beautifulsoup4
 breathe
+
+# Third party
+third_party/pydata-sphinx-theme
diff --git a/source/reference/comparison/adaptive_avg_pool2d.rst b/source/reference/comparison/adaptive_avg_pool2d.rst
index fc539ac4509..a1724492fa5 100644
--- a/source/reference/comparison/adaptive_avg_pool2d.rst
+++ b/source/reference/comparison/adaptive_avg_pool2d.rst
@@ -36,7 +36,7 @@ AdaptiveAvgPool2d differences
 --------

 Shape of the input tensor
-~~~~~~~~~~~~~~~
+~~~~~~~~~~~~

 PyTorch supports NCHW or CHW input; MegEngine supports NCHW input.
diff --git a/source/reference/comparison/adaptive_max_pool2d.rst b/source/reference/comparison/adaptive_max_pool2d.rst
index 97981d305a3..a4d617aa1df 100644
--- a/source/reference/comparison/adaptive_max_pool2d.rst
+++ b/source/reference/comparison/adaptive_max_pool2d.rst
@@ -1,8 +1,8 @@
 .. _comparison-adaptive_max_pool2d:

-===========================
+=========================
 AdaptiveMaxPool2d differences
-===========================
+=========================

 .. panels::
@@ -37,15 +37,14 @@ AdaptiveMaxPool2d differences
 --------

 Shape of the input tensor
-~~~~~~~~~~~~~~~
-
+~~~~~~~~~~~~
 PyTorch supports NCHW or CHW input; MegEngine supports NCHW input.

 Parameter differences
 --------

 return_indices parameter
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~

 PyTorch has a ``return_indices`` parameter; MegEngine does not. It is a boolean that specifies whether the indices of the maximum values in the output tensor are returned as well. When it is set to True, the function returns a two-part tuple: the output tensor and the indices of the maxima. The output tensor is the result of the adaptive max pooling operation, and the indices record the position of the maximum value within each output region.
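The ``return_indices`` behaviour described in the hunk above is easiest to see in a short PyTorch-only sketch (a minimal illustration; the shapes below are arbitrary):

.. code-block:: python

    # return_indices=True makes PyTorch return a (values, indices) tuple;
    # MegEngine's AdaptiveMaxPool2d has no equivalent flag.
    import torch

    pool = torch.nn.AdaptiveMaxPool2d(output_size=(2, 2), return_indices=True)
    x = torch.randn(1, 3, 8, 8)  # NCHW input

    out, indices = pool(x)
    print(out.shape)      # torch.Size([1, 3, 2, 2]) -- pooled maxima
    print(indices.shape)  # torch.Size([1, 3, 2, 2]) -- positions of the maxima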
diff --git a/source/reference/comparison/conv_transpose3d.rst b/source/reference/comparison/conv_transpose3d.rst
index 46c31fc461c..8bbde5f2eeb 100644
--- a/source/reference/comparison/conv_transpose3d.rst
+++ b/source/reference/comparison/conv_transpose3d.rst
@@ -59,8 +59,8 @@ padding
 ~~~~~~~~~~~~
 PyTorch padding can be a single number or a tuple; MegEngine padding only supports filling with the value 0.

-compute_mode parameter
-~~~~~~~~~~~~~~~~~~~
+compute_mode parameter
+~~~~~~~~~~~~~~~~~
 MegEngine has a ``compute_mode`` parameter; PyTorch does not. It specifies the compute mode: when set to "default", no special precision is required of intermediate results; when set to "float32", float32 is used as the accumulator for intermediate results, which only takes effect when the input and output dtypes are float16.

 conv_mode parameter
diff --git a/source/reference/comparison/embedding.rst b/source/reference/comparison/embedding.rst
index 4bb214a9e62..c867a6d1b5a 100644
--- a/source/reference/comparison/embedding.rst
+++ b/source/reference/comparison/embedding.rst
@@ -70,7 +70,7 @@ sparse
 PyTorch's ``sparse`` parameter indicates whether sparse updates are used; MegEngine has no such parameter.

 initial_weight
-~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~
 MegEngine's ``initial_weight`` initializes the module's learnable weights with shape (num_embeddings, embedding_dim); PyTorch has no such parameter.

 .. code-block:: python
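A minimal sketch of ``initial_weight`` in use (the weight values are arbitrary; PyTorch's closest equivalent, the separate ``Embedding.from_pretrained`` classmethod, is mentioned only for contrast):

.. code-block:: python

    # MegEngine accepts the full weight matrix at construction time via
    # initial_weight, with shape (num_embeddings, embedding_dim).
    import numpy as np
    import megengine
    import megengine.module as M

    weight = np.arange(12, dtype="float32").reshape(4, 3)  # 4 embeddings of dim 3
    emb = M.Embedding(num_embeddings=4, embedding_dim=3,
                      initial_weight=megengine.Tensor(weight))

    idx = megengine.Tensor(np.array([0, 2], dtype="int32"))
    print(emb(idx).numpy())  # rows 0 and 2 of the weight matrix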
diff --git a/source/reference/comparison/max_pool2d.rst b/source/reference/comparison/max_pool2d.rst
index 75022f0dca2..c82807a04fb 100644
--- a/source/reference/comparison/max_pool2d.rst
+++ b/source/reference/comparison/max_pool2d.rst
@@ -42,7 +42,7 @@ Max_Pool2d differences
 --------

 dilation parameter
-~~~~~~~~~~~~~
+~~~~~~~~~~~~
 PyTorch has a ``dilation`` parameter; MegEngine does not. It controls the spacing between elements within the pooling window.

 return_indices parameter
@@ -55,7 +55,6 @@ ceil_mode parameter
 PyTorch has a ceil_mode parameter; MegEngine does not. When it is True, the output shape is computed by rounding up (ceiling); when it is False, by rounding down (floor).

 .. code-block:: python
-
    import megengine
    import torch
    import numpy as np
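The ``ceil_mode`` difference shows up directly in output shapes; a minimal side-by-side sketch, assuming both torch and megengine are installed (the 5x5 input is arbitrary):

.. code-block:: python

    # ceil_mode changes how PyTorch rounds the output size; MegEngine has no
    # such flag and matches the ceil_mode=False (round-down) behaviour.
    import numpy as np
    import torch
    import megengine
    import megengine.functional as F

    x = np.random.randn(1, 1, 5, 5).astype("float32")

    t_floor = torch.nn.functional.max_pool2d(torch.tensor(x), kernel_size=2, ceil_mode=False)
    t_ceil = torch.nn.functional.max_pool2d(torch.tensor(x), kernel_size=2, ceil_mode=True)
    m = F.max_pool2d(megengine.Tensor(x), kernel_size=2)

    print(t_floor.shape)  # torch.Size([1, 1, 2, 2])
    print(t_ceil.shape)   # torch.Size([1, 1, 3, 3])
    print(m.shape)        # (1, 1, 2, 2) -- same as ceil_mode=False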
diff --git a/source/user-guide/model-development/data/index.rst b/source/user-guide/model-development/data/index.rst
index 58d3e7933bb..6e1d1cc04fb 100644
--- a/source/user-guide/model-development/data/index.rst
+++ b/source/user-guide/model-development/data/index.rst
@@ -1,5 +1,4 @@
 .. _data-guide:
-
 .. currentmodule:: megengine

 ===========================
diff --git a/source/user-guide/model-development/jit/xla.rst b/source/user-guide/model-development/jit/xla.rst
index 04f6d2b0e09..e56afc94d5a 100644
--- a/source/user-guide/model-development/jit/xla.rst
+++ b/source/user-guide/model-development/jit/xla.rst
@@ -35,14 +35,6 @@ The mge_xlalib installation commands are as follows:

    # cuda 11.8
    python3 -m pip install mge-xlalib==0.4.7+cuda11080.cudnn860 -f https://www.megengine.org.cn/whl/mge.html

-When compiling and optimizing, xla relies on tools such as nvptx for runtime compilation, so the
-related dependencies have to be installed in the environment. For cuda 11.8, nvidia already
-supports installing them via pip:
-
-.. code-block:: shell
-
-   pip install "nvidia-cuda-cupti-cu11>=11.8" "nvidia-cuda-nvcc-cu11>=11.8" "nvidia-cuda-runtime-cu11>=11.8"
-
-For cuda 11.1 and cuda 11.4, cuda has to be installed manually and directories such as cuda/bin
-added to PATH. In terms of both performance and convenience, cuda 11.8 is therefore the
-recommended choice if you want to use mge-xla.
-
 The XLA compiler is used much like the compiler built into the MegEngine graph runtime:
 wrap the training function with the decorator (xla_trace) provided by MegEngine.
 The first run of the function records the operator execution sequence to capture a static graph;
 subsequent runs compile the static graph with XLA and call the compiled
 XLA executable to speed up training.
@@ -93,95 +85,36 @@ XLA executable to speed up training.
     print (xla_fused_softmax(inp)) # run in xla

-To inspect some of the intermediate IR produced by the mge and xla optimizations, set the
-environment variable MGE_VERBOSE_XLA_IR to print the corresponding results. When
-MGE_VERBOSE_XLA_IR is 1, the graph IR traced out by mge is printed; when it is 2, xla's hlo
-graph structure is printed; when it is 3, the graph after xla's compilation optimizations is
-printed. If we export MGE_VERBOSE_XLA_IR=1 and then run the code above, we can see:
-
-.. code-block:: python
-
-    please_realize_func_name_system_1(
-        0%:<256x1000x1000,f32>
-    ) {
-        1%:<256x1000x1000,f32> = io_mark_var(0%:<256x1000x1000,f32>)
-        2%:<256x1000x1,f32> = ReduceMAX(1%:<256x1000x1000,f32>)
-        3%:<256x1000x1000,f32> = SUB(1%:<256x1000x1000,f32>, 2%:<256x1000x1,f32>)
-        4%:<256x1000x1000,f32> = EXP(3%:<256x1000x1000,f32>)
-        5%:<256x1000x1,f32> = ReduceSUM(4%:<256x1000x1000,f32>)
-        6%:<256x1000x1000,f32> = TRUE_DIV(4%:<256x1000x1000,f32>, 5%:<256x1000x1,f32>)
-        7%:<256x1000x1000,f32> = io_mark_var(6%:<256x1000x1000,f32>)
-        return 1 7%:<256x1000x1000,f32>
-    }
-
 When the model's training iteration is completely static, you can also use the jit.xla_trace
 decorator to hand the whole training iteration over to XLA. The optimizer and module need to be
 passed in as train_func arguments, and train_func must contain the model forward pass, backward
 pass, parameter update, and related code. A code example follows:

 .. code-block:: python
-
-    :emphasize-lines: 44-51, 58
-
-    from functools import partial
-    import numpy as np
-
-    import megengine
-    import megengine.autodiff as autodiff
-    import megengine.functional as F
-    import megengine.module as M
-    from megengine import distributed as dist
-    from megengine.jit import partial_trace, xla_trace
-    from megengine.optimizer import AdamW
-
-    class ConvNet(M.Module):
-        def __init__(self):
-            super().__init__()
-            self.conv1 = M.Conv2d(3, 6, 5, bias=False)
-            self.bn1 = M.BatchNorm2d(6)
-            self.conv2 = M.Conv2d(6, 16, 5, bias=False)
-            self.bn2 = M.BatchNorm2d(16)
-            self.fc1 = M.Linear(16 * 5 * 5, 120)
-            self.fc2 = M.Linear(120, 84)
-            self.classifier = M.Linear(84, 10)
-            self.pool = M.AvgPool2d(2, 2)
-
-        def forward(self, x):
-            x = self.pool(self.bn1(self.conv1(x)))
-            x = self.pool(self.bn2(self.conv2(x)))
-            x = F.flatten(x, 1)
-            x = self.fc1(x)
-            x = self.fc2(x)
-            x = self.classifier(x)
-            return x
-
-    @dist.launcher(n_gpus=2, device_type="gpu")
-    def worker():
-        def runner():
-            model = ConvNet()
-            model.train()
-            dist.bcast_list_(model.tensors())
-
-            cblist = [dist.make_allreduce_cb("mean")]
-            gm = autodiff.GradManager().attach(model.parameters(), callbacks=cblist)
-            optimizer = AdamW(model.parameters(), lr=0.01)
-
-            @xla_trace(without_host=True, capture_as_const=True)
-            def func(model, optimizer, timage, tlabel):
-                with gm:
-                    score = model(timage)
-                    loss = F.nn.cross_entropy(score, tlabel)
-                    gm.backward(loss)
-                    optimizer.step().clear_grad()
-                return loss
-
-            image = np.random.randn(3, 8, 3, 32, 32)
-            label = np.random.randint(0, 10, (3, 8,))
-            for i in range(6):
-                timage = megengine.Tensor(image[i % 3])
-                tlabel = megengine.Tensor(label[i % 3])
-                loss = func(model, optimizer, timage, tlabel)
-                print(loss)
-
-        runner()
-
-    worker()
+    :emphasize-lines: 3-12, 20
+
+    from megengine.jit import xla_trace
+
+    @xla_trace(capture_as_const=True)  # when capture_as_const is True, every external Tensor not in train_func's argument list is captured as a constant
+    def train_func(data, label, *, opt, net):
+        gm = GradManager()
+        gm.attach(net.parameters())
+        with gm:
+            logits = net(data)
+            loss = F.loss.cross_entropy(logits, label)
+            gm.backward(loss)
+            opt.step().clear_grad()
+        return loss
+
+    for epoch in range(total_epochs):
+        total_loss = 0
+        for step, (batch_data, batch_label) in enumerate(dataloader):
+            data = mge.tensor(batch_data)
+            label = mge.tensor(batch_label)
+
+            loss = train_func(data, label, opt=optimizer, net=model)
+            total_loss += loss.numpy().item()
+        print("epoch: {}, loss {}".format(epoch, total_loss/len(dataloader)))

 .. _partial_trace:
@@ -190,55 +123,30 @@
 When there is dynamic execution logic in the model's training iteration, the whole computation
 cannot be handed over to XLA. In this case the jit.partial_trace decorator can be used to
 accelerate the static parts.
-
 The forward/backward of the parts wrapped by partial_trace run on XLA, while the remaining
 parts are still executed by MegEngine. A code example follows:

 .. code-block:: python
-
-    :emphasize-lines: 12-27
-
-    @dist.launcher(n_gpus=2, device_type="gpu")
-    def worker():
-        def runner():
-            model = ConvNet()
-            model.train()
-            dist.bcast_list_(model.tensors())
-
-            cblist = [dist.make_allreduce_cb("mean")]
-            gm = autodiff.GradManager().attach(model.parameters(), callbacks=cblist)
-            optimizer = AdamW(model.parameters(), lr=0.01)
-
-            model.forward = partial(
-                partial_trace(
-                    func=type(model).forward,
-                    backend="xla",
-                    capture_as_const=True,
-                ),
-                model,
-            )
-            optimizer._updates = partial(
-                partial_trace(
-                    func=type(optimizer)._updates,
-                    backend="xla",
-                    capture_as_const=True,
-                ),
-                optimizer,
-            )
-
-            image = np.random.randn(3, 8, 3, 32, 32)
-            label = np.random.randint(0, 10, (3, 8,))
-            for i in range(6):
-                timage = megengine.Tensor(image[i % 3])
-                tlabel = megengine.Tensor(label[i % 3])
-                with gm:
-                    score = model(timage)
-                    loss = F.nn.cross_entropy(score, tlabel)
-                    gm.backward(loss)
-                    optimizer.step().clear_grad()
-                print(loss)
-
-        runner()
-
-    worker()
+    :emphasize-lines: 3-5, 15
+
+    from megengine.jit import partial_trace
+
+    @partial_trace(backend="xla", capture_as_const=True)
+    def backbone(model, inp):
+        return model(inp)
+
+    for epoch in range(total_epochs):
+        total_loss = 0
+        gm = GradManager()
+        gm.attach(net.parameters())
+        for step, (batch_data, batch_label) in enumerate(dataloader):
+            data = mge.tensor(batch_data)
+            label = mge.tensor(batch_label)
+            with gm:
+                logits = backbone(net, data)
+                loss = F.loss.cross_entropy(logits, label)
+                gm.backward(loss)
+                opt.step().clear_grad()
+            total_loss += loss.numpy().item()
+        print("epoch: {}, loss {}".format(epoch, total_loss/len(dataloader)))
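A minimal sketch of the dynamic case partial_trace is meant for, mirroring the ``backbone(net, data)`` pattern above; DynamicNet, the branch condition, and all shapes are made-up illustrations rather than code from the docs:

.. code-block:: python

    # Only the static backbone is traced and compiled by XLA; the
    # data-dependent branch stays in ordinary MegEngine execution.
    # Requires a MegEngine build with mge-xlalib installed.
    import numpy as np

    import megengine as mge
    import megengine.module as M
    from megengine.jit import partial_trace


    @partial_trace(backend="xla", capture_as_const=True)
    def backbone(model, inp):
        return model.body(inp)  # static part: runs on XLA after the first call


    class DynamicNet(M.Module):
        def __init__(self):
            super().__init__()
            self.body = M.Linear(16, 16)
            self.head_a = M.Linear(16, 4)
            self.head_b = M.Linear(16, 4)

        def forward(self, x):
            feat = backbone(self, x)
            if feat.mean().item() > 0:  # dynamic logic: cannot be traced, stays on MegEngine
                return self.head_a(feat)
            return self.head_b(feat)


    net = DynamicNet()
    for _ in range(3):
        x = mge.tensor(np.random.randn(2, 16).astype("float32"))
        print(net(x).shape)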